In [57]:
import numpy as np
import random
import datetime
import time
import os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from tqdm import tqdm
import ast
import pandas as pd
import seaborn as sns

import argparse
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import shuffle
from ember_utils import *
from ember_model import *
from ember_pjr_utils import *

In [86]:
def GetFamilyDict(X_train, Y_train, Y_train_family,\
                  task_month, mal_cnt, global_family_dict):
    count = 0
    for x_ind, x_sample in enumerate(X_train):
        count += 1
        #print(x_ind, Y_train[x_ind])

        if Y_train[x_ind] == 0:
            global_family_dict["goodware"].append(x_sample)
        if Y_train[x_ind] == 1:
            mal_cnt += 1
            
            if Y_train_family[x_ind] == '':
                global_family_dict["others_family"].append(x_sample)
            else:
                global_family_dict[Y_train_family[x_ind]].append(x_sample)
    
    #print(f'Task {task_month} and #-of new samples stored {count}')
    
    return global_family_dict, mal_cnt


# parser = argparse.ArgumentParser()
# parser.add_argument('--num_exps', type=int, default=1, required=False, help='Number of Experiments to Run.')
# parser.add_argument('--contamination', type=float, default=0.1, required=False)
# parser.add_argument('--num_epoch', type=int, default=500, required=False)
# parser.add_argument('--batch_size', type=int, default=2000, required=False)
# parser.add_argument('--memory_budget', type=int, required=False)
# parser.add_argument('--data_dir', type=str,\
#                     default='../../month_based_processing_with_family_labels/', required=False)

# args = parser.parse_args()

def IFS_Samples(v, v_choose, get_anomalous=True, contamination=0.1):
    data_X = v
    clf = IsolationForest(max_samples=len(data_X), contamination=contamination)
    clf.fit(data_X)
    y_pred = clf.predict(data_X)
    anomalous_idx = np.where(y_pred == -1.0)
    similar_idx = np.where(y_pred == 1.0)

    assert len(anomalous_idx[0]) + len(similar_idx[0]) == len(y_pred)
    
    if get_anomalous:
        anomalous_samples_pool = list(data_X[anomalous_idx])
        similar_samples_pool = list(data_X[similar_idx])

        v_choose_split = int(np.ceil(v_choose/2))

        if len(anomalous_samples_pool) > v_choose_split:
            anomalous_samples = random.sample(anomalous_samples_pool, v_choose_split)

        else:
            anomalous_samples = anomalous_samples_pool

        if len(anomalous_samples) == v_choose_split:
            similar_samples = random.sample(similar_samples_pool, v_choose_split)
        elif len(anomalous_samples) < v_choose_split:
            v_choose_split += v_choose_split - len(anomalous_samples)
            if len(similar_samples_pool) > v_choose_split:
                similar_samples = random.sample(similar_samples_pool, v_choose_split)
            else:
                similar_samples = similar_samples_pool
        if len(anomalous_samples) > 0 and len(similar_samples) > 0: 
            anomalous_samples, similar_samples = np.array(anomalous_samples), np.array(similar_samples)
            #print(f'anomalous_samples {anomalous_samples.shape} similar_samples {similar_samples.shape}')
            replay_samples = np.concatenate((anomalous_samples, similar_samples))
        else:
            if len(anomalous_samples) <= 0:
                replay_samples = similar_samples
            if len(similar_samples) <= 0:
                replay_samples = anomalous_samples
    else:
        similar_samples_pool = list(data_X[similar_idx])
        if len(similar_samples_pool) > v_choose:
            similar_samples = random.sample(similar_samples_pool, v_choose)
        else:
            similar_samples = similar_samples_pool
            
        replay_samples = np.array(similar_samples)
        
    return replay_samples


def MixSampleCount(GBudget, MinBudget, GFamilyDict):
    
    print(f'budget unallocated {GBudget} MinBudget {MinBudget}')
    
    tmpBudget = GBudget
    
    GfamStat = {}
    
    for fam, S in GFamilyDict.items():
        if fam != 'goodware':
            GfamStat[fam] = len(S)
    
    assert len(GfamStat.keys()) == len(GFamilyDict.keys()) - 1
    
    GfamChoose = {}
    GfamTemp = {}
    
    allocated = 0
    for fam, numSample in GfamStat.items():
        if numSample > MinBudget:
            GfamChoose[fam] = MinBudget
            GfamTemp[fam] = numSample - MinBudget
            GBudget -= MinBudget
            allocated += MinBudget
        else:
            GfamChoose[fam] = numSample
            GfamTemp[fam] = 0
            GBudget -= numSample
            allocated += numSample
    
    print(f'GBudget {GBudget} allocated {allocated}')
    
    if allocated > tmpBudget:
        print(f'reduce minimum samples, budget is lower than required allocation')
    
    UnallocatedSamples = int(sum(GfamTemp.values()))

    #print(f'allocated {allocated} unallocated {GBudget} Sample remainin {UnallocatedSamples}')
    for fam, numSample in GfamTemp.items():
        if numSample != 0:
            #print(f'here ')
            allocate = int(np.round((numSample/UnallocatedSamples) * GBudget))
            #print(f'GBudget {GBudget} {np.round(numSample/UnallocatedSamples)} allocate {allocate}')
            GfamChoose[fam] += allocate
            
    #print(f'_________________')        
            
    return GfamChoose

def IFS(GFamilyDict, memory_budget,\
        goodware_ifs=False, min_samples=1, fs_option='ratio'):
    #fs_option = 'uniform'
    #memory_budget = 1000
    goodware_budget = malware_budget = int(np.ceil(memory_budget/2))
    
    num_families = len(GFamilyDict.keys()) - 1 
    pre_malSamples = []
    #cnt = 0
    #fam_cnt = 0
    
    if malware_count > malware_budget:
        if fs_option == 'mix':
            GfamChoose = MixSampleCount(malware_budget, min_samples, GFamilyDict)
    
    for k, v in GFamilyDict.items():
        
        if k != 'goodware':
            if malware_count > malware_budget:
                if fs_option != 'gifs':
                    #fam_cnt += 1
                    v = np.array(v)
                    #print(f'{k} - {len(v)}')
                    #cnt += len(v)

                    if fs_option == 'ratio':
                        v_choose = int(np.ceil((len(v) / malware_count) * malware_budget))

                    if fs_option == 'uniform':
                        v_choose = int(np.ceil(malware_budget / num_families))

                    if fs_option == 'mix':
                        #print(f'malware_count {malware_count} > malware_budget {malware_budget}')
                        v_choose = GfamChoose[k]
                        #print(f'v_choose {v_choose} **')
                        
#                         v_choose = int(np.ceil((len(v) / malware_count) * malware_budget))
#                         if v_choose < min_samples:
#                             #print(f'v_choose {v_choose} min_samples {min_samples}')
#                             v_choose = min_samples
#                         #else: print(f'v_choose {v_choose} **')                

                    if len(v) <= v_choose:
                        for i in v:
                            pre_malSamples.append(i)
                    else:
                        v = IFS_Samples(v, v_choose, get_anomalous=True, contamination=0.1)
                        for i in v:
                            pre_malSamples.append(i)
                else:
                    for i in v:
                        pre_malSamples.append(i)
            else:
                #print(f'malware_count {malware_count} <= malware_budget {malware_budget}')
                for i in v:
                    pre_malSamples.append(i)
    
    if fs_option == 'gifs':
        if malware_budget < len(pre_malSamples):
            pre_malSamples = random.sample(list(pre_malSamples), malware_budget)
    
    
    all_Goodware = GFamilyDict['goodware']
    if goodware_ifs:
        #print(f'I am here NOW.')
        pre_GoodSamples = []
        v = np.array(all_Goodware)
        v_choose = goodware_budget
        v = IFS_Samples(v, v_choose, get_anomalous=True, contamination=0.1)
        for i in v:
            pre_GoodSamples.append(i)
    else:
        if goodware_budget > len(all_Goodware):
            pre_GoodSamples = all_Goodware
        else:
            pre_GoodSamples = random.sample(list(all_Goodware), goodware_budget)
    
    print(f'Replay Goodware {len(pre_GoodSamples)} Replay Malware {len(pre_malSamples)}')
    samples_to_replay = np.concatenate((np.array(pre_GoodSamples), np.array(pre_malSamples)))
    labels_to_replay = np.concatenate((np.zeros(len(pre_GoodSamples)), np.ones(len(pre_malSamples))))
    
    X_replay, Y_replay = shuffle(samples_to_replay, labels_to_replay)
    
    return X_replay, Y_replay



all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

# data_dir = '../../month_based_processing_with_family_labels/'

patience = 5
replay_type = 'ifs'


data_dir = '../../../month_based_processing_with_family_labels/' #args.data_dir
num_exps = 1 #args.num_exps
num_epoch = 1 #args.num_epoch
batch_size = 2 #args.batch_size
memory_budget = 1 #args.memory_budget

contamination = 0.1 #args.contamination #0.1 #[0.2, 0.3, 0.4, 0.5]

exp_seeds = [random.randint(1, 99999) for i in range(num_exps)]


expSaveDir = '../IFS_Final_' + str(contamination) + '_'
resSaveDir = './IFS_Results'
expSaveFile = '/IFS_'  + str(contamination) + '_'

memory_budget=10000

for exp in exp_seeds:
    GFamilyDict = defaultdict(list)
    
    malware_count = 0
    for task_month in range(len(all_task_months[:2])):
                
        print(f'\n{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}...')
        task_start = time.time()
        
        #task_month = task_month
        current_task = all_task_months[task_month]
        task_months = all_task_months[:task_month+1]
        #print(f'Current Task {current_task} w/ {num_samples_per_malware_family} samples to Replay per Malware family.')


        model_save_dir = '../IFS_SavedModel' + '/IFSModel_' + str(memory_budget) + '/' + str(current_task) + '/'
        create_parent_folder(model_save_dir)
        
        opt_save_path = '../IFSS_SavedModel' + '/IFSOpt_' + str(memory_budget) + '/' + str(current_task) + '/'
        create_parent_folder(opt_save_path)
        
        results_save_dir =  './IFS_SavedResults_' +'/IFS_' + str(memory_budget) + '/' 
        create_parent_folder(results_save_dir)
        
        
        X_train, Y_train, Y_train_family = get_family_labeled_month_data(data_dir, current_task)
        X_test, Y_test, Y_test_family = get_family_labeled_task_test_data(data_dir, task_months, mlp_net=True)
        

        if current_task == all_task_months[0]:
            GFamilyDict, malware_count = GetFamilyDict(X_train, Y_train, Y_train_family,\
                                                   current_task, malware_count, GFamilyDict)
            num_Y_replay = 0
        else:
            X_replay, Y_replay = IFS(GFamilyDict, memory_budget, goodware_ifs=False,\
                                     min_samples=100, fs_option='mix')
            num_Y_replay = len(Y_replay)
            
            GFamilyDict, malware_count = GetFamilyDict(X_train, Y_train, Y_train_family,\
                                           current_task, malware_count, GFamilyDict)
        print()
        print(f'X_train {X_train.shape} Y_train {Y_train.shape}')
        print()


2023-08-17 00:45:57...
X_train (55722, 2381) Y_train (55722,) Y_tr_family (55722,)
X_test (6192, 2381) Y_test (6192,) Y_te_family (6192,)

X_train (55722, 2381) Y_train (55722,)


2023-08-17 00:45:58...
X_train (48723, 2381) Y_train (48723,) Y_tr_family (48723,)
X_test (11606, 2381) Y_test (11606,) Y_te_family (11606,)
budget unallocated 5000 MinBudget 100
GBudget -4847 allocated 9847
reduce minimum samples, budget is lower than required allocation


ValueError: Sample larger than population or is negative

In [68]:



GfamChoose = MixSampleCount(10000, 100, GFamilyDict)

print(sum(GfamChoose.values()))

budget unallocated 10000
9999


In [77]:

for k, v in GfamChoose.items():
    
    if v < 0:
        print(k, v)

In [66]:
def IFS_Samples(v, v_choose, get_anomalous=True, contamination=0.1):
    data_X = v
    clf = IsolationForest(max_samples=len(data_X), contamination=contamination)
    clf.fit(data_X)
    y_pred = clf.predict(data_X)
    anomalous_idx = np.where(y_pred == -1.0)
    similar_idx = np.where(y_pred == 1.0)

    assert len(anomalous_idx[0]) + len(similar_idx[0]) == len(y_pred)
    
    if get_anomalous:
        anomalous_samples_pool = list(data_X[anomalous_idx])
        similar_samples_pool = list(data_X[similar_idx])

        v_choose_split = int(np.ceil(v_choose/2))

        if len(anomalous_samples_pool) > v_choose_split:
            anomalous_samples = random.sample(anomalous_samples_pool, v_choose_split)

        else:
            anomalous_samples = anomalous_samples_pool

        if len(anomalous_samples) == v_choose_split:
            similar_samples = random.sample(similar_samples_pool, v_choose_split)
        elif len(anomalous_samples) < v_choose_split:
            v_choose_split += v_choose_split - len(anomalous_samples)
            if len(similar_samples_pool) > v_choose_split:
                similar_samples = random.sample(similar_samples_pool, v_choose_split)
            else:
                similar_samples = similar_samples_pool
        if len(anomalous_samples) > 0 and len(similar_samples) > 0: 
            anomalous_samples, similar_samples = np.array(anomalous_samples), np.array(similar_samples)
            #print(f'anomalous_samples {anomalous_samples.shape} similar_samples {similar_samples.shape}')
            replay_samples = np.concatenate((anomalous_samples, similar_samples))
        else:
            if len(anomalous_samples) <= 0:
                replay_samples = similar_samples
            if len(similar_samples) <= 0:
                replay_samples = anomalous_samples
    else:
        similar_samples_pool = list(data_X[similar_idx])
        if len(similar_samples_pool) > v_choose:
            similar_samples = random.sample(similar_samples_pool, v_choose)
        else:
            similar_samples = similar_samples_pool
            
        replay_samples = np.array(similar_samples)
        
    return replay_samples


def IFS(GFamilyDict, memory_budget, fs_option='ratio'):
    #fs_option = 'uniform'
    #memory_budget = 1000
    goodware_budget = malware_budget = int(np.ceil(memory_budget/2))
    
    num_families = len(GFamilyDict.keys()) - 1 
    pre_malSamples = []
    #cnt = 0
    #fam_cnt = 0
    
    
    for k, v in GFamilyDict.items():
        
        if k != 'goodware':
            if fs_option != 'gifs':
                #fam_cnt += 1
                v = np.array(v)
                #print(f'{k} - {len(v)}')
                #cnt += len(v)

                if fs_option == 'ratio':
                    v_choose = int(np.ceil((len(v) / malware_count) * malware_budget))
                
                if fs_option == 'uniform':
                    v_choose = int(np.ceil(num_families / malware_budget))


                if len(v) <= v_choose:
                    for i in v:
                        pre_malSamples.append(i)
                else:
                    v = IFS_Samples(v, v_choose, get_anomalous=True, contamination=0.1)
                    for i in v:
                        pre_malSamples.append(i)
            else:
                for i in v:
                    pre_malSamples.append(i)
    
    if fs_option == 'gifs':
        pre_malSamples = random.sample(list(pre_malSamples), malware_budget)
    
    all_Goodware = GFamilyDict['goodware']

    # if goodware_budget < len(pre_malSamples):
    #     goodware_budget = len(pre_malSamples)

    pre_GoodSamples = random.sample(list(all_Goodware), goodware_budget)
    
    
    samples_to_replay = np.concatenate((np.array(pre_GoodSamples), np.array(pre_malSamples)))
    labels_to_replay = np.concatenate((np.zeros(len(pre_GoodSamples)), np.ones(len(pre_malSamples))))
    
    from sklearn.utils import shuffle

    X_replay, Y_replay = shuffle(samples_to_replay, labels_to_replay)
    
    return X_replay, Y_replay



X_replay, Y_replay = IFS(GFamilyDict, memory_budget, fs_option='ratio')

print(f'{len(np.where(Y_replay==1.)[0])} - {len(np.where(Y_replay==0.)[0])}')

1757 - 500


In [65]:
print(f'{len(np.where(Y_replay==1.)[0])} - {len(np.where(Y_replay==0.)[0])}')

1397 - 500


In [54]:
from sklearn.utils import shuffle

X_replay, Y_replay = shuffle(samples_to_replay, labels_to_replay)

In [51]:
Y_replay

array([1., 1., 1., ..., 0., 0., 0.])

In [55]:
len(np.where(Y_replay==1.)[0]), len(np.where(Y_replay==0.)[0])

(1757, 1757)

In [57]:
from sklearn.utils import shuffle

X_replay, Y_replay = shuffle(samples_to_replay, labels_to_replay)

print(f'{len(np.where(Y_replay==1.)[0])} - {len(np.where(Y_replay==0.)[0])}')

1397 - 1397
