In [None]:
import os
import numpy as np
import tqdm
import json
import random
import time
import multiprocessing
from ember_features import PEFeatureExtractor
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import copy
import matplotlib.pyplot as plt
import scipy as stats
from sklearn.manifold import TSNE
from collections import defaultdict
import seaborn as sns
import hdbscan
from mpl_toolkits.mplot3d import Axes3D

# Notebook-wide matplotlib styling. The three helper values are kept as
# module-level names so they remain available to other cells.
tdir = 'in'
major = 5.0
minor = 3.0

plt.rcParams['font.size'] = 18
#plt.rcParams['font.family'] = "serif"

# Apply the same tick direction and tick sizes to both axes.
for _axis in ('xtick', 'ytick'):
    plt.rcParams[_axis + '.direction'] = tdir
    plt.rcParams[_axis + '.major.size'] = major
    plt.rcParams[_axis + '.minor.size'] = minor

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
# (Constructing np.random.RandomState(42) and discarding the object, as was
# done before, leaves the global generator unseeded.)
np.random.seed(42)

In [None]:
# Load every monthly task, standardize its features and bucket the malware
# samples by family label.
#
# Results left in the notebook namespace:
#   malware_family_samples[task]  -> defaultdict: family name -> list of feature rows
#   others_family[task]           -> list of malware rows whose family label is ''
#   malware_family_sample_count   -> family label -> number of malware samples (all tasks)
#   goodware_family_sample_count  -> family label -> number of goodware samples (all tasks)
all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

# Root folder that holds one sub-directory per month.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
DATA_ROOT = '/home/mr6564/continual_research/month_based_processing_with_family_labels/'

# Number of float32 features per row of X_train.dat (used as the memmap width).
ndim = 2381

malware_family_sample_count = {}
goodware_family_sample_count = {}

malware_family_samples = {}

others_family = {}

# A single scaler shared across months: partial_fit() below updates its running
# statistics with each month, so a month is transformed with the statistics
# accumulated up to and including that month.
standardization = StandardScaler()

for task, current_task in enumerate(all_task_months):

    data_dir = DATA_ROOT + str(current_task) + '/'

    # Family label of every sample; read once and close the archive right away.
    Y_family_labels_file = data_dir + 'task_family_labels.npz'
    with np.load(Y_family_labels_file) as Y_fam_labels_:
        Y_fam_labels = Y_fam_labels_['family_labels']

    y_path = os.path.join(data_dir, "y_train.dat")
    X_path = os.path.join(data_dir, "X_train.dat")

    # The label file is flat, so its length gives the number of samples N.
    y_ = np.memmap(y_path, dtype=np.float32, mode="r")
    N = y_.shape[0]
    X_ = np.memmap(X_path, dtype=np.float32, mode="r", shape=(N, ndim))

    ##standardize
    standard_scaler = standardization.partial_fit(X_)
    X_ = standard_scaler.transform(X_)
    X_ = np.array(X_, np.float32)
    print(f'task {current_task} samples {len(X_)}')

    # Label 0 -> goodware, label 1 -> malware; any other label value is ignored.
    goodware_indices = np.flatnonzero(y_ == 0).tolist()
    malware_indices = np.flatnonzero(y_ == 1).tolist()

    others_family_samples = []
    malware_task_family_samples = defaultdict(list)

    # Only malware rows are grouped; rows are visited in ascending index order.
    for ind in malware_indices:
        family = Y_fam_labels[ind]
        if family == '':
            others_family_samples.append(X_[ind])
        else:
            malware_task_family_samples[family].append(X_[ind])

    # NOTE(review): malware without a family label is stored only in
    # `others_family`, never under an 'others_family' key of
    # `malware_family_samples`, although top_10 lists 'others_family'.
    malware_family_samples[task] = malware_task_family_samples
    others_family[task] = others_family_samples

    Y_families_malware = Y_fam_labels[malware_indices]
    Y_families_goodware = Y_fam_labels[goodware_indices]

    # Accumulate per-family sample counts over all tasks.
    for yfam in Y_families_malware:
        malware_family_sample_count[yfam] = malware_family_sample_count.get(yfam, 0) + 1

    for yfam in Y_families_goodware:
        goodware_family_sample_count[yfam] = goodware_family_sample_count.get(yfam, 0) + 1


print(len(malware_family_sample_count.keys()), len(goodware_family_sample_count.keys()))

In [None]:
# Ten family names of interest; a family is referred to by its position in
# this list.
top_10 = [
    'xtrat', 'zbot', 'ramnit', 'sality', 'installmonster',
    'zusy', 'emotet', 'vtflooder', 'others_family', 'fareit',
]

In [None]:
def get_HDBSCAN_train_test(task_id, family_ind, task_families_data,
                           min_valid_cluster_size=10, train_fraction=0.9):
    """Cluster one family's samples with HDBSCAN and split them into train/test.

    The samples of the selected family and task are clustered with a default
    ``hdbscan.HDBSCAN()``. Points labelled ``-1`` and clusters with fewer than
    ``min_valid_cluster_size`` members are dropped. The remaining clusters are
    renumbered ``0, 1, 2, ...`` in ascending order of their HDBSCAN label, and
    the kept samples are shuffled with the ``random`` module before being
    split.

    Parameters
    ----------
    task_id : int
        Index into the twelve month labels ('2018-01' ... '2018-12'); also the
        key used to look up ``task_families_data``.
    family_ind : int
        Index into the function's own ``top_10`` family list.
    task_families_data : mapping
        ``task_families_data[task_id][family_name]`` must yield the feature
        vectors (one per sample) of that family for that task.
    min_valid_cluster_size : int, optional
        Minimum number of members a cluster needs to be kept (default 10).
    train_fraction : float, optional
        Fraction of the kept samples assigned to the training split
        (default 0.9); the size is truncated with ``int()``.

    Returns
    -------
    X_tr, Y_tr, X_te, Y_te : numpy.ndarray
        Training samples / cluster ids and test samples / cluster ids. When no
        cluster passes the size filter, all four arrays are empty (the sample
        arrays keep the feature dimension of the input).
    """
    top_10 = ['xtrat', 'zbot', 'ramnit', 'sality', 'installmonster',\
                  'zusy', 'emotet', 'vtflooder', 'others_family', 'fareit']
    all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

    curr_task = all_task_months[task_id]
    family_name = top_10[family_ind]

    print(f'task {curr_task} family name {family_name}')
    # Convert once: both the clusterer and the index selection below need an array.
    data_X = np.asarray(task_families_data[task_id][family_name])

    clf = hdbscan.HDBSCAN()
    labels = clf.fit(data_X).labels_
    unique_labels = np.unique(labels)

    kept_samples = []
    kept_labels = []
    cnt = 0  # number of clusters kept so far == id given to the next kept cluster
    for ulabel in unique_labels:
        if ulabel == -1:
            # -1 is the label HDBSCAN gives to points it could not assign.
            continue
        member_idx = np.flatnonzero(labels == ulabel)
        if len(member_idx) < min_valid_cluster_size:
            continue
        kept_samples.append(data_X[member_idx])
        kept_labels.append(np.full(len(member_idx), cnt))
        cnt += 1

    if kept_samples:
        valid_clusters_samples = np.concatenate(kept_samples)
        valid_clusters_labels = np.concatenate(kept_labels)
    else:
        # No usable cluster: keep the feature dimension so callers still get
        # consistently shaped (empty) arrays.
        valid_clusters_samples = np.empty((0,) + data_X.shape[1:], dtype=data_X.dtype)
        valid_clusters_labels = np.empty(0, dtype=int)

    print(f'valid clusters {cnt} among {len(unique_labels)}')
    print(f'{len(valid_clusters_labels) == len(valid_clusters_samples)}')

    samples_indx = list(range(len(valid_clusters_labels)))
    random.shuffle(samples_indx)

    train_size = int(len(samples_indx) * train_fraction)
    # An explicit integer dtype keeps empty index arrays usable for indexing
    # (np.array([]) alone would be float and be rejected as an index).
    trainset = np.array(samples_indx[:train_size], dtype=int)
    testset = np.array(samples_indx[train_size:], dtype=int)

    # Separate the training set
    X_tr = valid_clusters_samples[trainset]
    Y_tr = valid_clusters_labels[trainset]

    # Separate the test set
    X_te = valid_clusters_samples[testset]
    Y_te = valid_clusters_labels[testset]

    return X_tr, Y_tr, X_te, Y_te

In [None]:
#X_tr, Y_tr, X_te, Y_te = get_HDBSCAN_train_test(0, 6, task_families_data)

In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader

from ember_utils import *
from ember_model import *
from ember_pjr_utils import *



# ---- Experiment configuration ----
patience = 10      # forwarded to training_early_stopping
num_exps = 1
num_epoch = 500
batch_size = 2

exp_type = 'testHDBSCAN-Quality'
replay_type = exp_type

# One randomly drawn seed; it is used for torch below and passed to the
# training helper.
exp_seeds = [random.randint(1, 99999) for _ in range(1)]
exp = exp_seeds[0]

# ---- Device and seeding ----
start_time = time.time()
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
torch.manual_seed(exp)

# ---- Model, optimizer and loss ----
model = Ember_MLP_Net()

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.000001,
)

# Wrap the model for multi-GPU use when more than one device is visible.
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print("Let's use", n_gpus, "GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)
print(f'Model has {count_parameters(model)/1000000}m parameters')

# NOTE(review): BCELoss expects targets in [0, 1], while the labels produced by
# get_HDBSCAN_train_test are cluster ids that exceed 1 once more than two
# clusters are kept -- confirm how training_early_stopping uses this criterion.
criterion = nn.BCELoss()

# Position of the family to analyse in top_10 (index 6 is 'emotet').
family_ind = 6

# Train on each month's HDBSCAN-derived dataset, then reload the saved
# checkpoint and evaluate it on that month's held-out split.
for task_month in range(len(all_task_months)):

    task_start = time.time()

    current_task = all_task_months[task_month]
    task_months = all_task_months[:task_month+1]

    model_save_dir = '../HDBSCAN_Quality' +\
                    str(exp_type) + '/HDBSCAN_Quality' + str(current_task) + '/'
    create_parent_folder(model_save_dir)

    # The optimizer gets its own directory. Previously this was the same path
    # as model_save_dir, so the two os.listdir(...)[0] look-ups below returned
    # the same file for both the model and the optimizer state.
    opt_save_path = '../HDBSCAN_Quality' +\
                str(exp_type) + '/HDBSCAN_Quality_optimizer' + str(current_task) + '/'
    create_parent_folder(opt_save_path)

    results_save_dir = './HDBSCAN_Quality' +\
                str(exp_type) + '/HDBSCAN_Quality' + '/'
    create_parent_folder(results_save_dir)

    X_train, Y_train, X_test, Y_test = get_HDBSCAN_train_test(task_month, family_ind, malware_family_samples)

    print()
    print(f'X_train {X_train.shape} Y_train {Y_train.shape}')
    print()

    task_training_time, epoch_ran, training_loss, validation_loss  =\
                            training_early_stopping(model, model_save_dir, opt_save_path,\
                            X_train, Y_train, X_test, Y_test, patience,\
                            batch_size, device, optimizer, num_epoch,\
                             criterion, replay_type, current_task, exp, earlystopping=True)

    # Rebuild a fresh model/optimizer and restore the state saved during training.
    # NOTE(review): os.listdir order is arbitrary; this presumably relies on each
    # directory holding exactly one file -- confirm against training_early_stopping.
    # NOTE(review): the rebuilt model is not wrapped in nn.DataParallel; confirm
    # the checkpoint still loads in a multi-GPU run.
    model = Ember_MLP_Net()
    model = model.to(device)
    #load the best model for this task
    best_model_path = model_save_dir + os.listdir(model_save_dir)[0]
    print(f'loading best model {best_model_path}')
    model.load_state_dict(torch.load(best_model_path))

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.000001)
    best_optimizer = opt_save_path + os.listdir(opt_save_path)[0]
    print(f'loading best optimizer {best_optimizer}')
    optimizer.load_state_dict(torch.load(best_optimizer))

    acc, rocauc = testing_aucscore(model, X_test, Y_test, batch_size, device)

    end_time = time.time()

    # Time elapsed since the start of the whole experiment (not just this task).
    print(f'Elapsed time {(end_time - start_time)/60} mins.')


end_time = time.time()
# The stray `cnt += 1` that used to sit here was removed: `cnt` is never defined
# at notebook level, so it raised NameError after all tasks had finished.
print(f'Elapsed time {(end_time - start_time)/60} mins.')