In [1]:
import os
import json

# Train/test boundary, expressed as a Unix epoch timestamp (seconds).
# Samples are split by their VirusTotal first-seen time relative to this value.
# 1522540800 corresponds to 2018-04-01 00:00:00 UTC.
SPLIT_TIMESTAMP = 1_522_540_800

# True  -> load the small sample datasets that ship with the repository.
# False -> load the full datasets (available from the authors upon request).
use_sample_datasets = True

`<DatasetName>_sandbox_metadata.json` and ``<DatasetName>_endpoint_metadata.json`` files include the metadata for each hash (sample) collected for our datasets. Let's load them from the json files included in this repository (after extracting the `.zip` files).

In [2]:
def _load_json_metadata(directory, filename):
    """Parse one metadata JSON file and return the resulting object.

    Args:
        directory: Folder that contains the metadata file.
        filename: Name of the JSON file inside ``directory``.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        FileNotFoundError: If the file does not exist (e.g., the dataset
            ``.zip`` archive has not been extracted yet).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Pass the encoding explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as fp:
        return json.load(fp)


# These files are included in the Train Data

if use_sample_datasets:
    train_data_path = 'MalwareITW_TrainData_Sample' # small train metadata, included in the repository
else:
    train_data_path = 'MalwareITW_TrainData' # full train metadata, shared when requested

train_sandbox_metadata = _load_json_metadata(train_data_path, 'train_sandbox_metadata.json')
train_endpoint_metadata = _load_json_metadata(train_data_path, 'train_endpoint_metadata.json')

# These files are included in the TrainAndTest Data

if use_sample_datasets:
    full_data_path = 'MalwareITW_TrainAndTestData_Sample' # small TrainAndTest metadata, included in the repository
else:
    full_data_path = 'MalwareITW_TrainAndTestData' # full TrainAndTest metadata, shared when requested

full_sandbox_metadata = _load_json_metadata(full_data_path, 'TrainAndTest_sandbox_metadata.json')
full_endpoint_metadata = _load_json_metadata(full_data_path, 'TrainAndTest_endpoint_metadata.json')

First, let's see the metadata in the **Train Data** for the `Habo` sandbox sample with hash: `9deb78f23cb5a7876992af03ca43acf5b9d9f000c22d62310df265e6ad4945ba`

In [3]:
# Hash of the example sandbox sample whose Train Data record is displayed below.
sandbox_example_hash = '9deb78f23cb5a7876992af03ca43acf5b9d9f000c22d62310df265e6ad4945ba'
train_sandbox_metadata[sandbox_example_hash]

{'first_seen': 1518344430,
 'publisher': 'Paul Mattes',
 'vhash': '0160866d1c0d1c051505105016z1c9z5bz1fz',
 'tlsh': 'T1E0853353E0B240BAE2B2D93D1C3A96245A237D6279B596183F8C9EDE1F33743190F356',
 'ssdeep': '24576:nnaVefuUQs5z3+x36czfY76eoPVCEX6BqR7YkfU3ZLvkdt9vkxlgMZUlYWh7Z0xx:na0iMY6d7E6BqRYZLMwgMZUea7s+HQN5',
 'dataset_name': ['sorel18'],
 'old': {'label': 0,
  'fam': 'BENIGN',
  'scan_date': 1518344430,
  'num_detections': -1,
  'label_source': 'sorel18'},
 'file_names': ['wc3270-3.6ga5-setup.exe',
  'managedway.dl.sourceforge.net',
  'downloads.sourceforge.net',
  'astuteinternet.dl.sourceforge.net',
  '9DEB78F23CB5A7876992AF03CA43ACF5B9D9F000C22D62310DF265E6AD4945BA.exe']}

* `first_seen`: Epoch timestamp that corresponds to the first seen date of the sample on VirusTotal
* `publisher`: The publisher information for the sample, extracted from the VirusTotal report. 
* `vhash`: The vhash hash of the binary file, included in its VirusTotal report.
* `tlsh`: The tlsh hash of the binary file, included in its VirusTotal report.
* `ssdeep`: The ssdeep hash of the binary file, included in its VirusTotal report.
* `dataset_name`: The source dataset of this sample. In our work, we merge multiple datasets, including *SOREL*, *EMBER* and VirusTotal Malware Folder (*vt17* and *vt18*). The same sample can be seen in multiple sources. The dataset name *ep* corresponds to the samples in our endpoint dataset.
* `file_names`: This is the list of file names this sample had in its submissions to VirusTotal.

There is also a sub-dictionary in the Train Data metadata with the key `old`:

This sub-dictionary contains the label information from a VirusTotal report that is older than the sample's `first_seen` timestamp. These older reports are not available for all samples, in which case, we used the most recent report to populate this sub-dictionary.

The keys in this sub-dictionary:

* `label`: This is the ground truth label we assigned to this sample (e.g., if the number of detections is over 5, the sample is labeled as malware). Labels 0, 1, and 2 denote benign, malware, and PUP samples, respectively.
* `fam`: This is the malware family tag of the sample, assigned using AVClass2. This tag is *BENIGN* if the sample is benign or *UNKNOWN* if the family tag is unavailable.
* `scan_date`: The timestamp of the VirusTotal report used to label this sample.
* `num_detections`: This is the number of AV engines on VirusTotal that detected this sample as malware. If it is -1, our labeling source was not VirusTotal.
* `label_source`: This indicates where our older VirusTotal report came from. For example, *latest_copied* means that we didn't have an older detection report for this sample and used the latest report.

Now let's look at the metadata of a sample in our endpoint dataset with hash `b6a5ca3c796ddc03b63d233a89095dbc655bc1fe4d1a1e9a520901656ece918b`. 

This record has the same structure as sandbox metadata records. All samples in this metadata file will include *ep* in the `dataset_name` list.

In [4]:
# Hash of the example endpoint sample whose Train Data record is displayed below.
endpoint_example_hash = 'b6a5ca3c796ddc03b63d233a89095dbc655bc1fe4d1a1e9a520901656ece918b'
train_endpoint_metadata[endpoint_example_hash]

{'first_seen': 1269478350,
 'publisher': 'NO SIGNATURE',
 'vhash': '044046151d155dzd=z',
 'tlsh': 'T15803728B36E7C666ED890B755E9AD6886517BC02DD10460B3ABC3F8FD9B52C24C44EC3',
 'ssdeep': '384:wnM4A2o6PJhDex6wl6sSeTO3/JF1a/wiS40RAC:wPThxexd3K3UYiRkAC',
 'dataset_name': ['ep'],
 'old': {'fam': 'BENIGN',
  'label': 0,
  'scan_date': 1654318924,
  'num_detections': 1,
  'label_source': 'latest_copied'},
 'file_names': ['perl.exe',
  '0a65acffd253a575c2aae13c523a7d99',
  'perl5.8.3.exe']}

If you would like to collect the binary labels and families for all our endpoint hashes, you can do as follows:

P.S. The small endpoint metadata included in this repository contains only three samples; request access for the full metadata.

In [5]:
from collections import Counter

# Gather every endpoint hash, then pull the fields of interest out of the
# matching Train Data records (labels and families live under the 'old' key).
endpoint_train_hashes = list(train_endpoint_metadata)
endpoint_records = [train_endpoint_metadata[sample_hash] for sample_hash in endpoint_train_hashes]

labels = [record['old']['label'] for record in endpoint_records]
families = [record['old']['fam'] for record in endpoint_records]
publishers = [record['publisher'] for record in endpoint_records]

# One frequency table per field. Headings after the first begin with a newline
# so that consecutive tables are separated by a blank line.
summaries = (
    ('Label Counts in our Endpoint Samples Based on Train Data metadata:', labels),
    ('\nFamily Counts in our Endpoint Samples Based on Train Data metadata:', families),
    ('\nPublisher Counts in our Endpoint Samples Based on Train Data metadata:', publishers),
)
for heading, values in summaries:
    print(heading)
    print(Counter(values))

Label Counts in our Endpoint Samples Based on Train Data metadata:
Counter({1: 1, 2: 1, 0: 1})

Family Counts in our Endpoint Samples Based on Train Data metadata:
Counter({'alien': 1, 'loadmoney': 1, 'BENIGN': 1})

Publisher Counts in our Endpoint Samples Based on Train Data metadata:
Counter({'NO SIGNATURE': 2, 'OOO YULIYA': 1})


So far, we looked at the records included in the **Train Data**. 

Now, let's look at a record included in the **Full Data** for the same endpoint sample with hash `b6a5ca3c796ddc03b63d233a89095dbc655bc1fe4d1a1e9a520901656ece918b`:

In [6]:
# Same endpoint sample as before, this time looked up in the Full Data metadata.
endpoint_example_hash = 'b6a5ca3c796ddc03b63d233a89095dbc655bc1fe4d1a1e9a520901656ece918b'
full_endpoint_metadata[endpoint_example_hash]

{'first_seen': 1269478350,
 'publisher': 'NO SIGNATURE',
 'vhash': '044046151d155dzd=z',
 'tlsh': 'T15803728B36E7C666ED890B755E9AD6886517BC02DD10460B3ABC3F8FD9B52C24C44EC3',
 'ssdeep': '384:wnM4A2o6PJhDex6wl6sSeTO3/JF1a/wiS40RAC:wPThxexd3K3UYiRkAC',
 'dataset_name': ['ep'],
 'new': {'fam': 'BENIGN',
  'label': 0,
  'scan_date': 1654318924,
  'num_detections': 1,
  'label_source': 'latest_copied'},
 'old': {'fam': 'BENIGN',
  'label': 0,
  'scan_date': 1654318924,
  'num_detections': 1,
  'label_source': 'latest_copied'},
 'file_names': ['perl.exe',
  '0a65acffd253a575c2aae13c523a7d99',
  'perl5.8.3.exe']}

As you can see, the **Full Data** metadata records include an additional sub-dictionary with the key `new`.

This sub-dictionary contains the label information from the most recent VirusTotal report (collected in 2022).
The structure of the `new` sub-dictionary is the same as the `old` sub-dictionary.

The **Full Data** metadata files also include samples that were first seen after the `SPLIT_TIMESTAMP`. These samples correspond to the testing samples in our work.

In [7]:
# Sanity check of the train/test split.
# The metadata in the Train Data is not expected to include any sample first
# seen after <SPLIT_TIMESTAMP>, so both "seen after" counts for it should be 0.

print(f'Total #samples in the train_sandbox_metadata: {len(train_sandbox_metadata)}')
print(f'Total #samples in the train_endpoint_metadata: {len(train_endpoint_metadata)}')

# Hashes whose first-seen time is strictly later than the split timestamp.
sb_test_samples_in_train = [h for h, record in train_sandbox_metadata.items() if record['first_seen'] > SPLIT_TIMESTAMP]
ep_test_samples_in_train = [h for h, record in train_endpoint_metadata.items() if record['first_seen'] > SPLIT_TIMESTAMP]

print(f'Total #samples in the train_sandbox_metadata seen after <SPLIT_TIMESTAMP>: {len(sb_test_samples_in_train)}')
# Report the endpoint list here; the sandbox list was previously printed on this line by mistake.
print(f'Total #samples in the train_endpoint_metadata seen after <SPLIT_TIMESTAMP>: {len(ep_test_samples_in_train)}')


print('\n==================\n')

# The metadata in the Full Data also includes samples first seen after
# <SPLIT_TIMESTAMP>; these are the testing samples.

print(f'Total #samples in the full_sandbox_metadata: {len(full_sandbox_metadata)}')
print(f'Total #samples in the full_endpoint_metadata: {len(full_endpoint_metadata)}')

sb_test_samples_in_full = [h for h, record in full_sandbox_metadata.items() if record['first_seen'] > SPLIT_TIMESTAMP]
ep_test_samples_in_full = [h for h, record in full_endpoint_metadata.items() if record['first_seen'] > SPLIT_TIMESTAMP]

print(f'Total #samples in the full_sandbox_metadata seen after <SPLIT_TIMESTAMP>: {len(sb_test_samples_in_full)}')
print(f'Total #samples in the full_endpoint_metadata seen after <SPLIT_TIMESTAMP>: {len(ep_test_samples_in_full)}')

Total #samples in the train_sandbox_metadata: 30
Total #samples in the train_endpoint_metadata: 3
Total #samples in the train_sandbox_metadata seen after <SPLIT_TIMESTAMP>: 0
Total #samples in the train_endpoint_metadata seen after <SPLIT_TIMESTAMP>: 0


Total #samples in the full_sandbox_metadata: 60
Total #samples in the full_endpoint_metadata: 6
Total #samples in the full_sandbox_metadata seen after <SPLIT_TIMESTAMP>: 30
Total #samples in the full_endpoint_metadata seen after <SPLIT_TIMESTAMP>: 3
