In [None]:
import os
import numpy as np
import tqdm
import json
import random
import time
import multiprocessing
from ember_features import PEFeatureExtractor

def vectorize(irow, raw_features_string, X_path, y_path, extractor, nrows):
    """
    Decode one JSON-encoded sample, convert it to a feature vector with
    ``extractor.process_raw_features`` and store the label and the vector in
    row ``irow`` of the existing float32 files at ``y_path`` and ``X_path``

    Both files are opened in "r+" mode, so they must already exist with room
    for ``nrows`` rows (and ``extractor.dim`` columns for the feature file).
    """
    sample = json.loads(raw_features_string)
    vector = extractor.process_raw_features(sample)

    # Label file: a flat array with one entry per row
    label_store = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows)
    label_store[irow] = sample["label"]

    # Feature file: one row of extractor.dim values per sample
    feature_shape = (nrows, extractor.dim)
    feature_store = np.memmap(X_path, dtype=np.float32, mode="r+", shape=feature_shape)
    feature_store[irow] = vector


def vectorize_unpack(args):
    """
    Adapter for single-argument pool mapping: spread the packed argument
    sequence into vectorize's positional parameters and hand back its result
    """
    outcome = vectorize(*args)
    return outcome



def create_parent_folder(file_path):
    """
    Make sure the directory component of file_path exists, creating any
    missing intermediate directories

    A path ending in a separator names a directory, so that directory itself
    is created. A bare file name has no directory component and is a no-op.

    Raises OSError (FileExistsError) if a non-directory already occupies the
    directory path.
    """
    parent = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, so skip paths without a
    # directory part: the current directory already exists.
    if parent:
        # exist_ok avoids the race between an existence check and creation
        os.makedirs(parent, exist_ok=True)
        
def raw_feature_iterator(file_paths, task_months):
    """
    Lazily walk the given JSON-lines files in order and yield, unmodified,
    every line whose decoded 'appeared' value is contained in task_months

    task_months should be a collection (list, tuple, set) of month strings.
    With a bare str the ``in`` test becomes a substring check.
    """
    for jsonl_path in file_paths:
        with open(jsonl_path, "r") as handle:
            yield from (
                record_line
                for record_line in handle
                if json.loads(record_line)['appeared'] in task_months
            )


def task_based_vectorize_subset(X_path, y_path, raw_feature_paths, task_months, extractor, nrows):
    """
    Vectorize a subset of data and write it to disk

    Allocates float32 files at X_path (nrows x extractor.dim) and y_path
    (nrows), then fans the samples selected by
    raw_feature_iterator(raw_feature_paths, task_months) out to a process
    pool; each worker writes its row straight into those files.

    nrows must equal the number of samples the iterator yields: rows are
    addressed by their enumeration index.
    """
    # Create space on disk to write features to
    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, extractor.dim))
    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows)

    # Drop the handles; workers re-open the files themselves by path
    del X, y

    # Lazily pair each selected raw line with its destination row
    argument_iterator = ((irow, raw_features_string, X_path, y_path, extractor, nrows)
                         for irow, raw_features_string in enumerate(raw_feature_iterator(raw_feature_paths, task_months)))

    # Distribute the vectorization work. The context manager guarantees the
    # worker processes are torn down even if a task raises; previously every
    # call left its pool (and its processes) alive.
    with multiprocessing.Pool() as pool:
        # Results carry no data; draining the iterator drives the progress bar
        for _ in tqdm.tqdm(pool.imap_unordered(vectorize_unpack, argument_iterator), total=nrows):
            pass
        # On success shut the workers down gracefully before the pool exits
        pool.close()
        pool.join()

        
def task_num_rows(raw_feature_paths, task_months):
    """
    Count the samples whose 'appeared' value is in task_months and collect
    their family labels

    Each file in raw_feature_paths is read as JSON lines, in order, and its
    path is printed once it has been opened. For every selected sample the
    'avclass' value is recorded, or 'missing_avclass' when the key is absent.

    task_months should be a collection (list, tuple, set) of month strings.
    With a bare str the ``in`` test becomes a substring check.

    Returns a tuple (number_of_selected_rows, family_labels) where
    family_labels is a list in the order the samples were read.
    """
    cnt_rows = 0
    family_labels = []

    for fp in raw_feature_paths:
        with open(fp, "r") as fin:
            print(fp)
            for line in fin:
                raw_features = json.loads(line)
                if raw_features['appeared'] not in task_months:
                    continue
                # Only a missing key is expected here; a bare except would
                # also swallow KeyboardInterrupt and genuine errors.
                family_labels.append(raw_features.get('avclass', 'missing_avclass'))
                cnt_rows += 1
    return cnt_rows, family_labels


def create_task_based_vectorized_features(data_dir, save_dir, current_task, task_months, feature_version=2):
    """
    Create feature vectors from raw features and write them to disk

    Reads train_features_0.jsonl .. train_features_5.jsonl and
    test_features.jsonl from data_dir, selects the samples whose 'appeared'
    value equals current_task, and writes into save_dir:
      - X_train.dat / y_train.dat: float32 feature and label files
      - task_family_labels.npz: the family label of every selected sample

    current_task is normally one month string; a collection of month strings
    is also accepted. task_months is kept for interface compatibility and is
    not used: only current_task drives the selection.
    """
    extractor = PEFeatureExtractor(feature_version)

    print(f'Vectorizing {current_task} task data')
    X_path = os.path.join(save_dir, "X_train.dat")
    y_path = os.path.join(save_dir, "y_train.dat")

    raw_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)]
    raw_feature_paths.append(os.path.join(data_dir, "test_features.jsonl"))

    # The helpers test `appeared in months`. Handing them a bare string makes
    # that a substring test (so '' or '2017' would match '2017-01'); wrap the
    # month in a list so membership is an exact comparison.
    if isinstance(current_task, str):
        selected_months = [current_task]
    else:
        selected_months = list(current_task)

    nrows, family_labels = task_num_rows(raw_feature_paths, selected_months)

    # os.path.join keeps this consistent with X_path / y_path whether or not
    # save_dir ends with a separator
    family_labels_path = os.path.join(save_dir, 'task_family_labels.npz')
    np.savez(family_labels_path, family_labels=family_labels)

    task_based_vectorize_subset(X_path, y_path, raw_feature_paths, selected_months, extractor, nrows)

def read_task_based_vectorized_features(save_dir, feature_version=2, train_fraction=0.9, seed=None):
    """
    Read the vectorized features of one task, keep the rows labelled 0 or 1,
    split them randomly into train and test parts and save both as .npz

    Inputs read from save_dir: X_train.dat, y_train.dat (float32 files) and
    task_family_labels.npz (array 'family_labels', one entry per row).
    Outputs written to save_dir:
      - XY_train.npz with X_train, Y_train, Y_family_train
      - XY_test.npz with X_test, Y_test, Y_family_test

    Rows whose label is neither 0 nor 1 are dropped. The selected rows are
    ordered label-0 first, then label-1, before being shuffled.

    train_fraction: share of the selected rows that goes to the training
        part (default 0.9); must lie in [0, 1].
    seed: when given, the shuffle uses a private random.Random(seed) so the
        split is reproducible; when None the global ``random`` state is used.

    Raises ValueError if train_fraction is out of range or if the number of
    family labels does not match the number of rows in y_train.dat.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f'train_fraction must be within [0, 1], got {train_fraction}')

    # The extractor is only needed for the feature dimensionality
    extractor = PEFeatureExtractor(feature_version)
    ndim = extractor.dim

    X_path = os.path.join(save_dir, "X_train.dat")
    y_path = os.path.join(save_dir, "y_train.dat")

    # The label file is a flat float32 array, so its length is the row count
    # needed to give the feature file its 2-D shape
    y_ = np.memmap(y_path, dtype=np.float32, mode="r")
    N = y_.shape[0]

    X_ = np.memmap(X_path, dtype=np.float32, mode="r", shape=(N, ndim))

    print(np.unique(y_))

    # Vectorized index lookup instead of a per-element Python loop
    goodware_indices = np.flatnonzero(y_ == 0)
    malware_indices = np.flatnonzero(y_ == 1)
    malware_goodware_indices = np.concatenate((goodware_indices, malware_indices))

    print(len(malware_goodware_indices), len(goodware_indices), len(malware_indices))

    # Close the archive once the array has been pulled out of it
    Y_family_labels_file = os.path.join(save_dir, 'task_family_labels.npz')
    with np.load(Y_family_labels_file) as Y_fam_labels_:
        Y_fam_labels = Y_fam_labels_['family_labels']

    # Family labels are matched to rows purely by position, so a length
    # mismatch would silently pair labels with the wrong samples
    if len(Y_fam_labels) != N:
        raise ValueError(
            f'{Y_family_labels_file} holds {len(Y_fam_labels)} labels but {y_path} holds {N} rows'
        )

    # Fancy indexing copies the selected rows into memory
    X = X_[malware_goodware_indices]
    Y = y_[malware_goodware_indices]
    Y_families = Y_fam_labels[malware_goodware_indices]

    indx = list(range(len(Y)))
    if seed is None:
        random.shuffle(indx)
    else:
        random.Random(seed).shuffle(indx)

    train_size = int(len(indx) * train_fraction)
    trainset = indx[:train_size]
    testset = indx[train_size:]

    # Separate the training set
    X_train = X[trainset]
    Y_train = Y[trainset]
    Y_family_train = Y_families[trainset]

    # Separate the test set
    X_test = X[testset]
    Y_test = Y[testset]
    Y_family_test = Y_families[testset]

    print(f'X_train {X_train.shape} Y_train {Y_train.shape} Y_family_train {Y_family_train.shape}\n X_test {X_test.shape} Y_test {Y_test.shape} \n Y_family_test {Y_family_test.shape}')

    print(f'saving files ...')
    save_training_file = os.path.join(save_dir, 'XY_train.npz')
    save_test_file = os.path.join(save_dir, 'XY_test.npz')

    np.savez(save_training_file, X_train=X_train, Y_train=Y_train, Y_family_train=Y_family_train)
    np.savez(save_test_file, X_test=X_test, Y_test=Y_test, Y_family_test=Y_family_test)

    
    
    
    
# The twelve months of 2017, in chronological order; each month is one task
all_task_months = [f'2017-{month:02d}' for month in range(1, 13)]

data_dir = "../../../ember2017/ember_2017_2/"

# Build and split the dataset of every task in turn, timing each one
for task, current_task in enumerate(all_task_months):
    start_time = time.time()

    # All months up to and including the current task
    task_months = all_task_months[:task + 1]

    # One output directory per task
    save_dir = f'../../../ember2017/EMBER2017withFamilyLabels/{current_task}/'
    create_parent_folder(save_dir)

    print(f'Processing data for task {current_task}')
    create_task_based_vectorized_features(data_dir, save_dir, current_task, task_months, feature_version=2)
    read_task_based_vectorized_features(save_dir, feature_version=2)

    end_time = time.time()
    print(f'Elapsed time {(end_time - start_time)/60} mins.')

Processing data for task 2017-01
Vectorizing 2017-01 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 78547/78547 [02:50<00:00, 461.58it/s] 


[-1.  0.  1.]
49941 17180 32761
X_train (44946, 2381) Y_train (44946,) Y_family_train (44946,)
 X_test (4995, 2381) Y_test (4995,) 
 Y_family_test (4995,)
saving files ...
Elapsed time 5.449914828936259 mins.
Processing data for task 2017-02
Vectorizing 2017-02 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 91453/91453 [02:54<00:00, 523.48it/s] 


[-1.  0.  1.]
60059 32820 27239
X_train (54053, 2381) Y_train (54053,) Y_family_train (54053,)
 X_test (6006, 2381) Y_test (6006,) 
 Y_family_test (6006,)
saving files ...
Elapsed time 5.53721563021342 mins.
Processing data for task 2017-03
Vectorizing 2017-03 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 53609/53609 [02:47<00:00, 319.41it/s] 


[-1.  0.  1.]
37953 25261 12692
X_train (34157, 2381) Y_train (34157,) Y_family_train (34157,)
 X_test (3796, 2381) Y_test (3796,) 
 Y_family_test (3796,)
saving files ...
Elapsed time 5.417266853650411 mins.
Processing data for task 2017-04
Vectorizing 2017-04 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 116391/116391 [03:07<00:00, 621.39it/s] 


[-1.  0.  1.]
72047 24739 47308
X_train (64842, 2381) Y_train (64842,) Y_family_train (64842,)
 X_test (7205, 2381) Y_test (7205,) 
 Y_family_test (7205,)
saving files ...
Elapsed time 5.776256624857584 mins.
Processing data for task 2017-05
Vectorizing 2017-05 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 116341/116341 [03:13<00:00, 601.85it/s] 


[-1.  0.  1.]
76652 29313 47339
X_train (68986, 2381) Y_train (68986,) Y_family_train (68986,)
 X_test (7666, 2381) Y_test (7666,) 
 Y_family_test (7666,)
saving files ...
Elapsed time 5.9401707688967385 mins.
Processing data for task 2017-06
Vectorizing 2017-06 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 53659/53659 [02:57<00:00, 301.67it/s] 


[-1.  0.  1.]
33348 20687 12661
X_train (30013, 2381) Y_train (30013,) Y_family_train (30013,)
 X_test (3335, 2381) Y_test (3335,) 
 Y_family_test (3335,)
saving files ...
Elapsed time 5.726492114861807 mins.
Processing data for task 2017-07
Vectorizing 2017-07 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 91963/91963 [03:11<00:00, 481.45it/s] 


[-1.  0.  1.]
59286 29785 29501
X_train (53357, 2381) Y_train (53357,) Y_family_train (53357,)
 X_test (5929, 2381) Y_test (5929,) 
 Y_family_test (5929,)
saving files ...
Elapsed time 5.9300231575965885 mins.
Processing data for task 2017-08
Vectorizing 2017-08 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 78037/78037 [03:08<00:00, 414.52it/s] 


[-1.  0.  1.]
50714 20215 30499
X_train (45642, 2381) Y_train (45642,) Y_family_train (45642,)
 X_test (5072, 2381) Y_test (5072,) 
 Y_family_test (5072,)
saving files ...
Elapsed time 5.9505980730056764 mins.
Processing data for task 2017-09
Vectorizing 2017-09 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 83245/83245 [03:40<00:00, 377.42it/s] 


[-1.  0.  1.]
51422 20561 30861
X_train (46279, 2381) Y_train (46279,) Y_family_train (46279,)
 X_test (5143, 2381) Y_test (5143,) 
 Y_family_test (5143,)
saving files ...
Elapsed time 6.715438036123912 mins.
Processing data for task 2017-10
Vectorizing 2017-10 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 86755/86755 [03:41<00:00, 391.08it/s] 


[-1.  0.  1.]
58578 29439 29139
X_train (52720, 2381) Y_train (52720,) Y_family_train (52720,)
 X_test (5858, 2381) Y_test (5858,) 
 Y_family_test (5858,)
saving files ...
Elapsed time 6.985703674952189 mins.
Processing data for task 2017-11
Vectorizing 2017-11 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


100%|██████████| 112085/112085 [03:51<00:00, 483.32it/s] 


[0. 1.]
112085 60449 51636
X_train (100876, 2381) Y_train (100876,) Y_family_train (100876,)
 X_test (11209, 2381) Y_test (11209,) 
 Y_family_test (11209,)
saving files ...
Elapsed time 7.68086051940918 mins.
Processing data for task 2017-12
Vectorizing 2017-12 task data
../../../ember2017/ember_2017_2/train_features_0.jsonl
../../../ember2017/ember_2017_2/train_features_1.jsonl
../../../ember2017/ember_2017_2/train_features_2.jsonl
../../../ember2017/ember_2017_2/train_features_3.jsonl
../../../ember2017/ember_2017_2/train_features_4.jsonl
../../../ember2017/ember_2017_2/train_features_5.jsonl
../../../ember2017/ember_2017_2/test_features.jsonl


 36%|███▌      | 31279/87915 [03:42<00:19, 2872.54it/s]