# Packware dataset analysis and selection

In [1]:
import numpy as np
import pandas as pd

# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
WILDLAB_PICKLE = "path/to/wildlab.pickle"
df = pd.read_pickle(WILDLAB_PICKLE)

In [2]:
# Keep only the rows that come from the lab dataset ('lab-v3' source)
is_lab = df['source'] == 'lab-v3'
df = df[is_lab]

In [3]:
# Number of rows per source value
df.source.value_counts()

lab-v3    290721
Name: source, dtype: int64

To verify that the lab dataset consists entirely of packed samples, we also count the rows of the dataframe filtered by the `packed` field.

In [4]:
# Count, per source, the rows of the lab dataset that are flagged as packed
is_packed = df['packed'] == True
df.loc[is_packed, 'source'].value_counts()

lab-v3    290721
Name: source, dtype: int64

In [5]:
# Keep only the columns needed by the rest of the notebook
needed_columns = ['unpacked_sample_sha1', 'benign', 'malicious', 'packer_name']
df = df[needed_columns]

In [6]:
# Number of rows per packer
df.packer_name.value_counts()

obsidium      48432
themida-v2    42803
petite        39495
telock        35284
pecompact     33956
upx           30558
mpress        22535
kkrunchy      22305
pelock        15353
Name: packer_name, dtype: int64

In [7]:
# Sum of the 'benign' and 'malicious' columns for each packer
df.groupby('packer_name')[['benign', 'malicious']].sum()

Unnamed: 0_level_0,benign,malicious
packer_name,Unnamed: 1_level_1,Unnamed: 2_level_1
kkrunchy,6811,15494
mpress,11041,11494
obsidium,16940,31492
pecompact,5610,28346
pelock,6879,8474
petite,13638,25857
telock,5235,30049
themida-v2,15895,26908
upx,9938,20620


Let's now filter the dataset on disk so that it keeps only the samples contained in the dataframe.

In [None]:
from os import walk, remove, path
from collections import defaultdict

DATASET_PATH = ""
# Set to False to run the clean-up below. It DELETES files from DATASET_PATH.
DONE = True
packed_files = [(dirpath, filenames) for (dirpath, dirnames, filenames) in walk(DATASET_PATH)]


def extract_sha1(file_name):
    """Return the SHA-1 embedded in a packed file name, or None if it has none.

    The SHA-1 is the text that follows the first underscore, cut at the next
    underscore and at the first dot (e.g. "upx_<sha1>.exe" -> "<sha1>").
    Names without any underscore do not follow this layout and give None.
    """
    fields = file_name.split("_")
    if len(fields) < 2:
        return None
    return fields[1].split(".")[0]


if not DONE:
    # Number of files kept on disk for each packer
    packers_count = defaultdict(int)

    for dirpath, filenames in packed_files:
        # The last component of the directory path is used as the packer name
        packer_name = path.basename(dirpath)
        packer_df = df[df['packer_name'] == packer_name]
        # A set makes the per-file membership test constant time
        sha_set = set(packer_df['unpacked_sample_sha1'].values)

        print("filtering " + str(packer_name) + "...")
        for file_name in filenames:
            file_sha1 = extract_sha1(file_name)
            if file_sha1 is None:
                # Never delete a file whose name cannot be parsed
                print("Skipping file with unexpected name: " + file_name)
                continue

            if file_sha1 in sha_set:
                packers_count[packer_name] += 1
                continue

            # Build the path from the directory that walk() actually reported, so it
            # is right whether or not DATASET_PATH ends with a separator
            file_path = path.join(dirpath, file_name)
            if path.exists(file_path):
                remove(file_path)
            else:
                print("The file does not exist")

            

Let's now undersample the dataset so that every packer has as many samples as the least-represented one.

In [9]:
# Same seed as the one used for the "packers classification" task in the paper, for reproducibility
SEED = 17

packers = df.packer_name.unique()
# 'none' is left out of the balancing (presumably the label of unpacked samples - TODO confirm)
balanced_packers = [p for p in packers if p != 'none']

# Size of the smallest packer class. It is computed from the data itself, so no
# hard-coded upper bound can silently cap the number of samples drawn per packer.
cur_min = min((len(df[df.packer_name == p]) for p in balanced_packers), default=0)

# Draw cur_min rows from every packer; each draw is seeded with SEED
indices = []
for p in balanced_packers:
    dp = df[df.packer_name == p]
    indices.extend(list(dp.sample(cur_min, random_state=SEED).index))

# Selecting by index label keeps the original row order.
# NOTE: this relies on df having a unique index; duplicated labels would pull in extra rows.
df_balanced = df[df.index.isin(indices)]

In [10]:
# Sanity check: every packer should now have the same number of samples
df_balanced.packer_name.value_counts()

themida-v2    15353
obsidium      15353
upx           15353
kkrunchy      15353
telock        15353
petite        15353
mpress        15353
pecompact     15353
pelock        15353
Name: packer_name, dtype: int64

In [11]:
# Total number of rows in the balanced dataframe
df_balanced.shape[0]

138177

In [12]:
# Sum of the 'benign' and 'malicious' columns for each packer, after balancing
df_balanced.groupby('packer_name')[['benign', 'malicious']].sum()

Unnamed: 0_level_0,benign,malicious
packer_name,Unnamed: 1_level_1,Unnamed: 2_level_1
kkrunchy,4694,10659
mpress,7518,7835
obsidium,5337,10016
pecompact,2468,12885
pelock,6879,8474
petite,5267,10086
telock,2217,13136
themida-v2,5656,9697
upx,5016,10337


Let's now select only 10% of the balanced dataset (per packer) in order to speed up the results extraction.

In [13]:
# Fraction of each packer's balanced samples to keep
SAMPLE_FRACTION = 0.1

# Sample each packer separately and concatenate the pieces in group order.
# Unlike groupby(...).apply(...), this keeps the 'packer_name' column on every
# pandas version (newer releases no longer pass the grouping columns to apply).
# Every group is drawn with the same SEED.
sampled_groups = [
    group.sample(frac=SAMPLE_FRACTION, random_state=SEED)
    for _, group in df_balanced.groupby('packer_name')
]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns
df_balanced_sampled = pd.concat(sampled_groups) if sampled_groups else df_balanced.iloc[0:0]

In [14]:
# Number of sampled rows per packer
df_balanced_sampled.packer_name.value_counts()

kkrunchy      1535
mpress        1535
obsidium      1535
pecompact     1535
pelock        1535
petite        1535
telock        1535
themida-v2    1535
upx           1535
Name: packer_name, dtype: int64

In [15]:
# Sum of the 'benign' and 'malicious' columns for each packer, in the 10% sample
df_balanced_sampled.groupby('packer_name')[['benign', 'malicious']].sum()

Unnamed: 0_level_0,benign,malicious
packer_name,Unnamed: 1_level_1,Unnamed: 2_level_1
kkrunchy,487,1048
mpress,760,775
obsidium,529,1006
pecompact,238,1297
pelock,716,819
petite,509,1026
telock,205,1330
themida-v2,564,971
upx,501,1034
