In [9]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import time, random
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, QuantileTransformer
import datetime
import argparse
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader

from ember_utils import *
from ember_model import *
from ember_pjr_utils import *


def count_replay_samples(data_dir, task_months, replay_portion):
    
    replay_samples_count = 0
    for month in task_months[:-1]:
        pre_X_tr, pre_Y_tr = get_month_data(data_dir, month)
        pre_X_tr, pre_Y_tr = get_partial_data(pre_X_tr, pre_Y_tr, replay_portion)
        
        replay_samples_count += len(pre_Y_tr)
    
    print(f'#of replay samples {replay_samples_count}')

    return replay_samples_count

all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

#data_dir = '../../ember2018/month_based_processing/'
data_dir = '../../month_based_processing_with_family_labels/'


replay_portion = 0.50

ReplaySamples_Count = []

for task_month in range(len(all_task_months)):

    current_task = all_task_months[task_month]
    task_months = all_task_months[:task_month+1]
    print(f'Current Task {current_task} with Replay {replay_portion*100}%')


    numreplaysamples = count_replay_samples(data_dir, task_months, replay_portion)
    
    ReplaySamples_Count.append(numreplaysamples)
    
print(f'\n\n {np.array(ReplaySamples_Count)}')

Current Task 2018-01 with Replay 50.0%
#of replay samples 0
Current Task 2018-02 with Replay 50.0%
#of replay samples 27861
Current Task 2018-03 with Replay 50.0%
#of replay samples 52222
Current Task 2018-04 with Replay 50.0%
#of replay samples 70908
Current Task 2018-05 with Replay 50.0%
#of replay samples 94344
Current Task 2018-06 with Replay 50.0%
#of replay samples 115004
Current Task 2018-07 with Replay 50.0%
#of replay samples 136784
Current Task 2018-08 with Replay 50.0%
#of replay samples 159923
Current Task 2018-09 with Replay 50.0%
#of replay samples 180364
Current Task 2018-10 with Replay 50.0%
#of replay samples 208610
Current Task 2018-11 with Replay 50.0%
#of replay samples 247496
Current Task 2018-12 with Replay 50.0%
#of replay samples 292496


 [     0  27861  52222  70908  94344 115004 136784 159923 180364 208610
 247496 292496]


In [13]:
import numpy as np
import random
import datetime
import time
import os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tqdm import tqdm
import argparse
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader

from ember_utils import *
from ember_model import *
from ember_pjr_utils import *



def create_parent_folder(file_path):
    if not os.path.exists(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))



def get_family_labeled_month_data(data_dir, month, train=True):
    
    if train:
        data_dir = data_dir + str(month) + '/'
        XY_train = np.load(data_dir + 'XY_train.npz')
        X_tr, Y_tr, Y_tr_family = XY_train['X_train'], XY_train['Y_train'], XY_train['Y_family_train']

        print(f'X_train {X_tr.shape} Y_train {Y_tr.shape} Y_tr_family {Y_tr_family.shape}')
        
        return X_tr, Y_tr, Y_tr_family
    else:
        data_dir = data_dir + str(month) + '/'
        XY_test = np.load(data_dir + 'XY_test.npz')
        X_test, Y_test, Y_test_family = XY_test['X_test'], XY_test['Y_test'], XY_test['Y_family_test']

        return X_test, Y_test, Y_test_family

    
    
def get_family_labeled_task_test_data(data_dir, task_months, mlp_net=False):
    
    X_te, Y_te, Y_te_family = get_family_labeled_month_data(data_dir, task_months[-1], train=False)
    
    for month in task_months[:-1]:
        pre_X_te, pre_Y_te, pre_Y_te_family = get_family_labeled_month_data(data_dir, month, train=False)
        X_te, Y_te, Y_te_family = np.concatenate((X_te, pre_X_te)), np.concatenate((Y_te, pre_Y_te)),\
                                np.concatenate((Y_te_family, pre_Y_te_family))
        

    X_test, Y_test, Y_test_family = X_te, Y_te, Y_te_family
    print(f'X_test {X_test.shape} Y_test {Y_test.shape} Y_te_family {Y_te_family.shape}')
    
    return X_test, Y_test, Y_test_family



def make_family_based_dict(X_train, Y_train, Y_train_family, task_month, global_family_dict):
    count = 0
    for x_ind, x_sample in enumerate(X_train):
        count += 1
        #print(x_ind, Y_train[x_ind])

        if Y_train[x_ind] == 0:
            global_family_dict["goodware"].append(x_sample)
        if Y_train[x_ind] == 1:
            if Y_train_family[x_ind] == '':
                global_family_dict["others_family"].append(x_sample)
            else:
                global_family_dict[Y_train_family[x_ind]].append(x_sample)

    print(f'Task {task_month} and #-of new samples stored {count}')
    
    return global_family_dict



def get_replay_samples(global_family_dict, num_samples_per_malware_family):
    pre_malware_samples = []

    cnt = 0
    for k in global_family_dict.keys():
        if k != 'goodware':
            cnt += 1
            if num_samples_per_malware_family > len(global_family_dict[k]):
                selected_family_samples = random.sample(global_family_dict[k], len(global_family_dict[k]))
            else:
                selected_family_samples = random.sample(global_family_dict[k], num_samples_per_malware_family)

            #print(selected_family_samples)
            for sample in selected_family_samples:
                pre_malware_samples.append(sample)
                
    if len(global_family_dict['goodware']) < len(pre_malware_samples):
        pre_goodware_samples = random.sample(global_family_dict['goodware'], len(global_family_dict['goodware']))
    else:
        pre_goodware_samples = random.sample(global_family_dict['goodware'], len(pre_malware_samples))

    samples_to_replay = np.concatenate((np.array(pre_goodware_samples), np.array(pre_malware_samples)))
    labels_to_replay = np.concatenate((np.zeros(len(pre_goodware_samples)), np.ones(len(pre_malware_samples))))


    #print(f'X_replay {samples_to_replay.shape} Y_replay {labels_to_replay.shape}')
    #print(f'Replay {len(pre_malware_samples)} malware samples of {len(global_family_dict.keys()) -1} families')
    #print(f'and Replay {len(pre_goodware_samples)} goodware samples')
    
    
    return samples_to_replay, labels_to_replay




all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

data_dir = '../../month_based_processing_with_family_labels/'



#num_samples_per_malware_family = 100

SamplesFamily = [50, 100, 200, 500, 1000, 5000, 10000]

for num_samples_per_malware_family in SamplesFamily:
    
    stored_global_family_dict = defaultdict(list)
    ReplaySamples_Count = []

    for task_month in range(len(all_task_months)):
        current_task = all_task_months[task_month]
        task_months = all_task_months[:task_month+1]


        X_train, Y_train, Y_train_family = get_family_labeled_month_data(data_dir, current_task)
        #X_test, Y_test, Y_test_family = get_family_labeled_task_test_data(data_dir, task_months, mlp_net=True)


        if current_task == all_task_months[0]:
            stored_global_family_dict = make_family_based_dict(X_train, Y_train, Y_train_family,\
                                                               current_task, stored_global_family_dict)
            ReplaySamples_Count.append(0)
        else:
            X_replay, Y_replay = get_replay_samples(stored_global_family_dict, num_samples_per_malware_family)
            stored_global_family_dict = make_family_based_dict(X_train, Y_train, Y_train_family,\
                                                               current_task, stored_global_family_dict)


            ReplaySamples_Count.append(len(Y_replay))

    print(f'\n\n {num_samples_per_malware_family} \n {np.array(ReplaySamples_Count)} \n\n')

X_train (55722, 2381) Y_train (55722,) Y_tr_family (55722,)
Task 2018-01 and #-of new samples stored 55722
X_train (48723, 2381) Y_train (48723,) Y_tr_family (48723,)
Task 2018-02 and #-of new samples stored 48723
X_train (37372, 2381) Y_train (37372,) Y_tr_family (37372,)
Task 2018-03 and #-of new samples stored 37372
X_train (46873, 2381) Y_train (46873,) Y_tr_family (46873,)
Task 2018-04 and #-of new samples stored 46873
X_train (41320, 2381) Y_train (41320,) Y_tr_family (41320,)
Task 2018-05 and #-of new samples stored 41320
X_train (43560, 2381) Y_train (43560,) Y_tr_family (43560,)
Task 2018-06 and #-of new samples stored 43560
X_train (46278, 2381) Y_train (46278,) Y_tr_family (46278,)
Task 2018-07 and #-of new samples stored 46278
X_train (40882, 2381) Y_train (40882,) Y_tr_family (40882,)
Task 2018-08 and #-of new samples stored 40882
X_train (56492, 2381) Y_train (56492,) Y_tr_family (56492,)
Task 2018-09 and #-of new samples stored 56492
X_train (77772, 2381) Y_train (77772,

X_train (55722, 2381) Y_train (55722,) Y_tr_family (55722,)
Task 2018-01 and #-of new samples stored 55722
X_train (48723, 2381) Y_train (48723,) Y_tr_family (48723,)
Task 2018-02 and #-of new samples stored 48723
X_train (37372, 2381) Y_train (37372,) Y_tr_family (37372,)
Task 2018-03 and #-of new samples stored 37372
X_train (46873, 2381) Y_train (46873,) Y_tr_family (46873,)
Task 2018-04 and #-of new samples stored 46873
X_train (41320, 2381) Y_train (41320,) Y_tr_family (41320,)
Task 2018-05 and #-of new samples stored 41320
X_train (43560, 2381) Y_train (43560,) Y_tr_family (43560,)
Task 2018-06 and #-of new samples stored 43560
X_train (46278, 2381) Y_train (46278,) Y_tr_family (46278,)
Task 2018-07 and #-of new samples stored 46278
X_train (40882, 2381) Y_train (40882,) Y_tr_family (40882,)
Task 2018-08 and #-of new samples stored 40882
X_train (56492, 2381) Y_train (56492,) Y_tr_family (56492,)
Task 2018-09 and #-of new samples stored 56492
X_train (77772, 2381) Y_train (77772,