In [None]:
import os
import json

# Cut-off between training and testing samples: a sample counts as a testing
# sample when its VirusTotal first-seen timestamp is later than this value.
# Unix epoch seconds; 1522540800 corresponds to 2018-04-01 00:00:00 UTC.
SPLIT_TIMESTAMP = 1_522_540_800

# Selects which copy of the datasets the notebook reads:
#   True  -> the small sample datasets included in the repository
#   False -> the full datasets, which are shared upon request
use_sample_datasets = True

`<DatasetName>_sandbox_metadata.json` and ``<DatasetName>_endpoint_metadata.json`` files include the metadata for each hash (sample) collected for our datasets. Let's load them from the json files included in this repository (after extracting the `.zip` files).

In [2]:
def _load_metadata(data_dir, file_name):
    """Read one metadata JSON file and return its parsed content.

    Args:
        data_dir: Directory that holds the extracted dataset files.
        file_name: Name of the JSON file inside ``data_dir``.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened (e.g. the archive was not extracted).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Name the encoding explicitly so the read does not depend on the
    # platform's default locale encoding (JSON files are expected to be UTF-8).
    with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as fp:
        return json.load(fp)


# These files are included in the Train Data

if use_sample_datasets:
    train_data_path = 'MalwareITW_TrainData_Sample' # small train metadata, included in the repository
else:
    train_data_path = 'MalwareITW_TrainData' # full train metadata, shared when requested

train_sandbox_metadata = _load_metadata(train_data_path, 'Train_sandbox_metadata.json')
train_endpoint_metadata = _load_metadata(train_data_path, 'Train_endpoint_metadata.json')

# These files are included in the TrainAndTest Data

if use_sample_datasets:
    traintest_data_path = 'MalwareITW_TrainAndTestData_Sample' # small TrainAndTest metadata, included in the repository
else:
    traintest_data_path = 'MalwareITW_TrainAndTestData' # full TrainAndTest metadata, shared when requested

traintest_sandbox_metadata = _load_metadata(traintest_data_path, 'TrainAndTest_sandbox_metadata.json')
traintest_endpoint_metadata = _load_metadata(traintest_data_path, 'TrainAndTest_endpoint_metadata.json')

First, let's see the metadata in the **Train Data** for the `Habo` sandbox sample with hash: `b3ac33c3156668b71d77f041ea48ad97fe96959f5ea1bc7e7e0695b29ab623d4`

In [3]:
# Look up the Train Data sandbox record stored under this hash; as the cell's
# last expression, the resulting value is displayed as the cell output.
train_sandbox_metadata['b3ac33c3156668b71d77f041ea48ad97fe96959f5ea1bc7e7e0695b29ab623d4']

{'first_seen': 1519065442,
 'publisher': 'Cloud Installer',
 'vhash': '016056655d1555616ze006f7zf0c5z30400411z6bz',
 'tlsh': 'T1DF3517316AC18031D3123331CE14EEEE356A6DB40DDA955FE2A43B394BB41B2DD3B65A',
 'ssdeep': '12288:vsM+aTA3c+FK1vrlVYBVignBtZnfVq4cz1i5pP9kPQK:UV4W8hqBYgnBLfVqx1Wjk3',
 'dataset_name': ['ember'],
 'old': {'label': 2,
  'scan_date': 1519065442,
  'num_detections': -1,
  'label_source': 'ember',
  'fam': 'GENERIC_MAL'},
 'file_names': ['IESettings',
  'myfile.exe',
  'VirusShare_e15a3a7731e9fb379ec15fce60466399']}

* `first_seen`: Epoch timestamp that corresponds to the first seen date of the sample on VirusTotal
* `publisher`: The publisher information for the sample, extracted from the VirusTotal report. 
* `vhash`: The vhash hash of the binary file, included in its VirusTotal report.
* `tlsh`: The tlsh hash of the binary file, included in its VirusTotal report.
* `ssdeep`: The ssdeep hash of the binary file, included in its VirusTotal report.
* `dataset_name`: The source dataset of this sample. In our work, we merge multiple datasets, including *SOREL*, *EMBER* and VirusTotal Malware Folder (*vt17* and *vt18*). The same sample can be seen in multiple sources. The dataset name *ep* corresponds to the samples in our endpoint dataset.
* `file_names`: This is the list of file names this sample had in its submissions to VirusTotal.

There is also a sub-dictionary in the Train Data metadata with the key `old`:

This sub-dictionary contains the label information from a VirusTotal report that is older than the sample's `first_seen` timestamp. These older reports are not available for all samples, in which case, we used the most recent report to populate this sub-dictionary.

The keys in this sub-dictionary:

* `label`: This is the ground truth label we assigned to this sample (e.g., if the number of detections is over 5, the sample is labeled as malware). Labels 0, 1, and 2 denote benign, malware, and PUP samples, respectively.
* `fam`: This is the malware family tag of the sample, assigned using AVClass2. This tag is *BENIGN* if the sample is benign or *UNKNOWN* if the family tag is unavailable.
* `scan_date`: The timestamp of the VirusTotal report used to label this sample.
* `num_detections`: This is the number of AV engines on VirusTotal that detected this sample as malware. If it is -1, our labeling source was not VirusTotal.
* `label_source`: This indicates where our older VirusTotal report came from. For example, *latest_copied* means that we didn't have an older detection report for this sample and used the latest report.

Now let's look at the metadata of a sample in our endpoint dataset with hash `f51a39a1735e1f1dd9d5c7cea3bd56a8dc4ba6f0b03747455778f73f0d78409a`. 

This record has the same structure as sandbox metadata records. All samples in this metadata file will include *ep* in the `dataset_name` list.

In [4]:
# Look up the Train Data endpoint record stored under this hash; as the cell's
# last expression, the resulting value is displayed as the cell output.
train_endpoint_metadata['f51a39a1735e1f1dd9d5c7cea3bd56a8dc4ba6f0b03747455778f73f0d78409a']

{'first_seen': 1521386004,
 'publisher': 'TOV Dveri Fado',
 'vhash': '095046651d751az3dnz95z17z',
 'tlsh': 'none',
 'ssdeep': '12288:bv2Jtp8DMW4chwIM+5he99cJFwOijTJCqbtFW1RY6NoEL/UD:GpHW5g9cJFIN6Rt5DUD',
 'dataset_name': ['ep'],
 'old': {'fam': 'cnopa',
  'label': 1,
  'scan_date': 1600864279,
  'num_detections': 59,
  'label_source': 'latest_copied'},
 'file_names': ['8e1d.tmp', 'ComDev.exe', 'nvfontcache.exe']}

If you would like to collect the binary labels and families for all our endpoint hashes, you can do as follows:

P.S. The small endpoint metadata included in this repository contains only three samples; request access for the full metadata.

In [5]:
from collections import Counter

# Gather the label, family tag and publisher of every endpoint sample listed
# in the Train Data metadata; all three lists follow the order of the hashes.
endpoint_train_hashes = list(train_endpoint_metadata)
labels = [train_endpoint_metadata[sample_hash]['old']['label'] for sample_hash in endpoint_train_hashes]
families = [train_endpoint_metadata[sample_hash]['old']['fam'] for sample_hash in endpoint_train_hashes]
publishers = [train_endpoint_metadata[sample_hash]['publisher'] for sample_hash in endpoint_train_hashes]

# Report how often each distinct value occurs.
print('Label Counts in our Endpoint Samples Based on Train Data metadata:')
print(Counter(labels))

print('\nFamily Counts in our Endpoint Samples Based on Train Data metadata:')
print(Counter(families))

print('\nPublisher Counts in our Endpoint Samples Based on Train Data metadata:')
print(Counter(publishers))

Label Counts in our Endpoint Samples Based on Train Data metadata:
Counter({2: 1, 0: 1, 1: 1})

Family Counts in our Endpoint Samples Based on Train Data metadata:
Counter({'installcore': 1, 'BENIGN': 1, 'cnopa': 1})

Publisher Counts in our Endpoint Samples Based on Train Data metadata:
Counter({'NO SIGNATURE': 2, 'TOV Dveri Fado': 1})


So far, we looked at the records included in the **Train Data**. 

Now, let's look at a record included in the **TrainAndTest Data** for the same endpoint sample with hash `f51a39a1735e1f1dd9d5c7cea3bd56a8dc4ba6f0b03747455778f73f0d78409a`:

In [6]:
# Look up the TrainAndTest Data endpoint record stored under this hash; as the
# cell's last expression, the resulting value is displayed as the cell output.
traintest_endpoint_metadata['f51a39a1735e1f1dd9d5c7cea3bd56a8dc4ba6f0b03747455778f73f0d78409a']

{'first_seen': 1521386004,
 'publisher': 'TOV Dveri Fado',
 'vhash': '095046651d751az3dnz95z17z',
 'tlsh': 'none',
 'ssdeep': '12288:bv2Jtp8DMW4chwIM+5he99cJFwOijTJCqbtFW1RY6NoEL/UD:GpHW5g9cJFIN6Rt5DUD',
 'dataset_name': ['ep'],
 'new': {'fam': 'cnopa',
  'label': 1,
  'scan_date': 1600864279,
  'num_detections': 59,
  'label_source': 'latest_copied'},
 'old': {'fam': 'cnopa',
  'label': 1,
  'scan_date': 1600864279,
  'num_detections': 59,
  'label_source': 'latest_copied'},
 'file_names': ['8e1d.tmp', 'ComDev.exe', 'nvfontcache.exe']}

As you can see, the **TrainAndTest Data** metadata records include an additional sub-dictionary with the key `new`.

This sub-dictionary contains the label information from the most recent VirusTotal report (collected in 2022).
The structure of the `new` sub-dictionary is the same as the `old` sub-dictionary.

The **TrainAndTest Data** metadata files also include samples that were first seen after the `SPLIT_TIMESTAMP`. These samples correspond to the testing samples in our work.

In [None]:
# Sanity check on the split: the metadata in the Train Data doesn't include
# any sample first seen after <SPLIT_TIMESTAMP>.

print(f'Total #samples in the train_sandbox_metadata: {len(train_sandbox_metadata)}')
print(f'Total #samples in the train_endpoint_metadata: {len(train_endpoint_metadata)}')

# Hashes whose first-seen timestamp is later than the split point (expected: none).
sb_test_samples_in_train = [h for h, meta in train_sandbox_metadata.items() if meta['first_seen'] > SPLIT_TIMESTAMP]
ep_test_samples_in_train = [h for h, meta in train_endpoint_metadata.items() if meta['first_seen'] > SPLIT_TIMESTAMP]

print(f'Total #samples in the train_sandbox_metadata seen after <SPLIT_TIMESTAMP>: {len(sb_test_samples_in_train)}')
# The endpoint message must report the endpoint list, not the sandbox list.
print(f'Total #samples in the train_endpoint_metadata seen after <SPLIT_TIMESTAMP>: {len(ep_test_samples_in_train)}')


print('\n==================\n')

# The metadata in the TrainAndTest Data also includes samples first seen after
# <SPLIT_TIMESTAMP> (testing samples).

print(f'Total #samples in the traintest_sandbox_metadata: {len(traintest_sandbox_metadata)}')
print(f'Total #samples in the traintest_endpoint_metadata: {len(traintest_endpoint_metadata)}')

# Hashes of the testing samples: first seen strictly after the split point.
sb_test_samples_in_full = [h for h, meta in traintest_sandbox_metadata.items() if meta['first_seen'] > SPLIT_TIMESTAMP]
ep_test_samples_in_full = [h for h, meta in traintest_endpoint_metadata.items() if meta['first_seen'] > SPLIT_TIMESTAMP]

print(f'Total #samples in the traintest_sandbox_metadata seen after <SPLIT_TIMESTAMP>: {len(sb_test_samples_in_full)}')
print(f'Total #samples in the traintest_endpoint_metadata seen after <SPLIT_TIMESTAMP>: {len(ep_test_samples_in_full)}')

Total #samples in the train_sandbox_metadata: 30
Total #samples in the train_endpoint_metadata: 3
Total #samples in the train_sandbox_metadata seen after <SPLIT_TIMESTAMP>: 0
Total #samples in the train_endpoint_metadata seen after <SPLIT_TIMESTAMP>: 0


Total #samples in the traintest_sandbox_metadata: 60
Total #samples in the traintest_endpoint_metadata: 6
Total #samples in the traintest_sandbox_metadata seen after <SPLIT_TIMESTAMP>: 30
Total #samples in the traintest_endpoint_metadata seen after <SPLIT_TIMESTAMP>: 3
