-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdataset
executable file
·223 lines (218 loc) · 15 KB
/
dataset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from pbox import *
from tinyscript import *
# Script metadata.
# NOTE(review): these dunder names look like the ones tinyscript reads when it builds the help message (through the
#  initialize() call made in the main section of this script) - confirm against tinyscript's documentation.
__script__ = "Dataset manipulation tool"
__version__ = "2.6.8"
# additional contributors, each entry giving the author and the reason of the contribution
__contributors__ = [
{'author': "Romain Jennes", 'reason': "added dataset alterations and adversarial learning capabilities"},
]
# long description of the tool
__doc__ = """
This tool aims to manipulate a dataset in many ways including its creation, enlargement, update, alteration, selection,
merging, export or purge.
"""
# short description of the tool
__description__ = "Make datasets of packed and not packed executables for use with Machine Learning"
# usage examples ; each entry starts with the subcommand it illustrates and entries are sorted by subcommand
__examples__ = [
"alter my-dataset --packed-only",
"alter my-dataset --percentage 30",
"alter my-dataset --query \"label == '-'\"",
"browse my-dataset --query \"label == 'upx'\"",
"convert my-dataset --new-name my-fileless-dataset",
"export my-dataset --format packed-samples --output my-dataset-files -n 100",
"fix my-dataset --detect",
"fix my-dataset --labels my-labels.json",
"ingest dataset-packed-elf --detect",
"ingest dataset-packed-pe --labels /path/to/exe/labels.json --merge --rename as-is",
"list --show-all",
"make my-dataset -f ELF -n 500 --source-dir /path/to/dotnet/exe",
"make my-dataset -f PE,ELF -n 2000",
"merge my-dataset dataset-to-be-merged",
"plot characteristic my-dataset label --perplexity 50",
"plot features my-dataset byte_0_after_ep byte_1_after_ep",
"plot features my-dataset number_resources --format png",
"plot labels my-dataset",
"preprocess my-dataset --preprocessor Std",
"purge all",
"purge test-*",
"remove my-dataset --query \"label == 'aspack'\"",
"select my-dataset my-new-dataset --query \"label == 'upx'\"",
"show my-dataset --per-format --limit 20",
"update my-dataset --detect --refresh -s /path1/to/exe -s /path2/to/exe",
"update my-dataset -f PE32 --source-dir /path/to/exe --labels /path/to/exe/labels.json",
"view my-dataset --query \"ctime > 2020\"",
"view my-dataset -q \"'PE32' in format\"",
]
# NOTE(review): presumably caps the number of examples displayed in the main help message - verify against tinyscript
__example_limit__ = {'main': 1}
if __name__ == '__main__':
    # Entry point: declare one subparser per dataset command, parse the command line, then instantiate a Dataset and
    #  call its method named after the selected command.
    # NOTE(review): parser, args, logger, sys, ts, initialize, add_argument, figure_options, configure_logging,
    #  set_yaml, Dataset, Packer, PREPROCESSORS and RENAME_FUNCTIONS are not defined in this script ; they are
    #  presumably provided by the star imports from pbox and tinyscript - confirm.

    # help text of -p/--preprocessor: one "- <key>: <name>" line per entry of PREPROCESSORS, sorted by key, the None
    #  key being skipped ; a None value is shown as "none" and a tuple value as "<name> with <key>=<value>, ..."
    pp_help = "preprocessor for shaping the data\n- %s\n" % \
              "\n- ".join("%s: %s" % (k.ljust(10), "none" if v is None else v.__name__ \
                                      if not isinstance(v, tuple) else "%s with %s" % \
                                      (v[0].__name__, ", ".join("{}={}".format(*i) for i in v[1].items()))) \
                          for k, v in sorted(PREPROCESSORS.items(), key=lambda x: x[0] or "none") if k is not None)
    cmds = parser.add_subparsers(dest="command", metavar="CMD", title="positional argument",
                                 help="command to be executed")
    # alter: -p/--percentage, the query and --packed-only are mutually exclusive
    alter = add_argument(cmds.add_parser("alter", category="create/modify/delete",
                                         help="alter the target dataset with a set of transformations"), "dsname")
    add_argument(alter, "alterations-set", "alteration")
    alter.add_argument("-n", "--new-name", help="name for the new altered dataset",
                       note="if None, the original dataset is altered")
    agroup = alter.add_mutually_exclusive_group()
    agroup.add_argument("-p", "--percentage", type=percentage, help="percentage of samples to be altered")
    add_argument(agroup, "query", default=None)
    agroup.add_argument("--packed-only", action="store_true", help="alter packed samples only")
    # browse: selecting a features set and -n/--no-feature are mutually exclusive
    browse = add_argument(cmds.add_parser("browse", category="read",
                                          help="compute features and browse the resulting data"), "dsname")
    bgroup = browse.add_mutually_exclusive_group()
    add_argument(bgroup, "features-set")
    bgroup.add_argument("-n", "--no-feature", action="store_true", help="do not compute features")
    add_argument(browse, "n-jobs")
    add_argument(browse, "query")
    # convert
    convert = add_argument(cmds.add_parser("convert", category="create/modify/delete",
                                           help="convert the target dataset to a fileless one"), "dsname")
    add_argument(convert, "features-set")
    convert.add_argument("-n", "--new-name", help="name for the new converted dataset",
                         note="if None, the original dataset is overwritten")
    add_argument(convert, "n-jobs")
    # edit
    add_argument(cmds.add_parser("edit", category="create/modify/delete", help="edit the data file"), "dsname")
    # export: -n/--number-samples lives in its own argument group as it only concerns packed samples
    export = add_argument(cmds.add_parser("export", category="create/modify/delete", help="export packed samples from a"
                                          " dataset or export the dataset to a given format"), "dsname")
    export.add_argument("-f", "--format", default="dsff", choices=("arff", "csv", "db", "dsff", "packed-samples"),
                        help="output format")
    add_argument(export, "n-jobs")
    export.add_argument("-o", "--output", default="export", metavar="F", type=ts.folder_does_not_exist,
                        help="output folder or file for the export", note="the extension gets added automatically")
    export_ = export.add_argument_group("option when exporting packed samples", before="extra arguments")
    export_.add_argument("-n", "--number-samples", dest="n", type=ts.pos_int, default=0,
                         help="number of packed samples to be exported")
    # fix
    fix = add_argument(cmds.add_parser("fix", category="create/modify/delete", help="fix a corrupted dataset"),
                       "dsname")
    add_argument(fix, "detect", "labels")
    # ingest
    ingest = cmds.add_parser("ingest", category="create/modify/delete",
                             help="ingest samples from a folder into new dataset(s)")
    ingest.add_argument("folder", type=ts.folder_exists, help="target folder with subfolders containing samples")
    add_argument(ingest, "detect", "labels")
    ingest.add_argument("--merge", action="store_true", help="merge all subfolders into a single dataset")
    ingest.add_argument("-m", "--min-samples", type=ts.pos_int, default=100,
                        help="minimum of samples to be found in subfolders for being kept")
    ingest.add_argument("-M", "--max-samples", type=ts.pos_int, default=0,
                        help="maximum of samples to be found in subfolders for being kept")
    ingest.add_argument("--overwrite", action="store_true", help="overwrite if dataset already exists")
    ingest.add_argument("-p", "--prefix", default="", help="prefix to be added to the name(s) of the new dataset(s)")
    ingest.add_argument("-r", "--rename", default="slugify", choices=tuple(RENAME_FUNCTIONS.keys()),
                        help="apply a function for naming the new dataset(s)")
    ingest.add_argument("-x", "--exclude", nargs="*", help="folder to be excluded")
    # list
    listds = cmds.add_parser("list", category="read", help="list all the datasets from the workspace")
    listds.add_argument("-a", "--show-all", action="store_true", help="show all datasets even those that are corrupted")
    listds.add_argument("-f", "--hide-files", action="store_true", help="hide the 'files' column")
    listds.add_argument("-r", "--raw", action="store_true", help="display unformatted text", note="useful with grep")
    # make: -a/--pack-all and -b/--balance are mutually exclusive
    make = add_argument(cmds.add_parser("make", category="create/modify/delete", help="add n randomly chosen "
                                        "executables from input sources to the dataset"), "dsname", force=True)
    make_ = make.add_mutually_exclusive_group()
    make_.add_argument("-a", "--pack-all", action="store_true", help="pack all executables",
                       note="this cannot be set with -b/--balance")
    make_.add_argument("-b", "--balance", action="store_true", help="balance the dataset relatively to the number of "
                       "packers used, not between packed and not packed")
    make.add_argument("-f", "--formats", type=ts.values_list, default="All", help="list of formats to be considered")
    make.add_argument("-n", "--number", dest="n", type=ts.pos_int, default=100,
                      help="number of executables for the output dataset")
    make.add_argument("-p", "--packer", action="extend", nargs="*", type=lambda p: Packer.get(p),
                      help="packer to be used")
    make.add_argument("-s", "--source-dir", action="extend", nargs="*", type=lambda p: ts.Path(p, expand=True),
                      help="executables source directory to be included")
    # merge
    merge = cmds.add_parser("merge", category="create/modify/delete", help="merge two datasets")
    add_argument(merge, "dsname")
    add_argument(merge, "dsname", argname="name2", help="name of the dataset to merge")
    merge.add_argument("-n", "--new-name", help="name for the new merged dataset",
                       note="if None, the original dataset is overwritten")
    add_argument(merge, "n-jobs")
    # plot: has its own level of subcommands, stored in args.subcommand
    plot = cmds.add_parser("plot", category="read", help="plot something about the dataset")
    pcmds = plot.add_subparsers(dest="subcommand", help="command to be executed")
    # plot characteristic
    plot_char = pcmds.add_parser("characteristic", help="plot reduced data highlighting a characteristic")
    add_argument(plot_char, "dsname")
    plot_char.add_argument("characteristic", type=characteristic_identifier, help="characteristic/feature identifier",
                           note="by 'characteristic', it is meant either executable metadata or feature")
    add_argument(plot_char, "reduction-algorithm", "binary", "ncomponents", "perplexity", "title")
    # plot features: "multiclass" and "true-class" are registered in the same mutually exclusive group
    plot_feat = pcmds.add_parser("features", help="distribution of one or multiple features (bar chart)")
    add_argument(plot_feat, "dsname", "feature")
    plot_feat_ = plot_feat.add_mutually_exclusive_group()
    add_argument(plot_feat_, "multiclass")
    plot_feat.add_argument("-n", "--num-values", default=0, type=ts.pos_int, note="0 means all",
                           help="limit number of most occurring distinct feature values in plot")
    add_argument(plot_feat_, "true-class")
    add_argument(plot_feat, "title")
    figure_options(plot_feat)
    # plot features-compare
    plot_featcomp = pcmds.add_parser("features-compare", help="compare feature difference with other datasets")
    add_argument(plot_featcomp, "dsname", "feature", "aggregate", "datasets", "max-features", "title",
                 max_feats_with="largest difference")
    figure_options(plot_featcomp)
    # plot infogain
    plot_ig = pcmds.add_parser("infogain", help="sorted distribution of information gains for the selected features "
                               "(bar chart)")
    add_argument(plot_ig, "dsname", "feature", "multiclass", "max-features", "title",
                 max_feats_with="highest information gain")
    figure_options(plot_ig)
    # plot infogain-compare
    plot_igcomp = pcmds.add_parser("infogain-compare", help="compare information gain difference between this dataset "
                                   "and others")
    add_argument(plot_igcomp, "dsname", "feature", "aggregate", "datasets", "multiclass", "max-features", "title",
                 max_feats_with="highest information gain")
    figure_options(plot_igcomp)
    # plot labels
    plot_lab = pcmds.add_parser("labels", help="distribution of labels in the dataset (pie chart)")
    add_argument(plot_lab, "dsname", "title")
    figure_options(plot_lab)
    # plot samples
    plot_samp = pcmds.add_parser("samples", help="plot each executable from the target dataset")
    add_argument(plot_samp, "dsname")
    figure_options(plot_samp)
    plot_samp.add_argument("-n", type=ts.pos_int, help="plot only N samples")
    add_argument(plot_samp, "query", "title")
    # preprocess
    preproc = cmds.add_parser("preprocess", category="read", help="preprocess the input dataset given preprocessors")
    add_argument(preproc, "dsname")
    add_argument(preproc, "n-jobs")
    preproc.add_argument("-p", "--preprocessor", action="extend", nargs="*", choices=PREPROCESSORS.keys(), help=pp_help)
    add_argument(preproc, "query")
    # purge
    purge = cmds.add_parser("purge", category="create/modify/delete", help="purge a dataset")
    add_argument(purge, "dsname", force=True, note="use 'all' to purge all datasets or '*' to select a part of them")
    purge.add_argument("-b", "--backup", action="store_true", help="only purge backups")
    # remove
    remove = add_argument(cmds.add_parser("remove", category="create/modify/delete",
                                          help="remove executables from a dataset"), "dsname")
    add_argument(remove, "query")
    # rename
    rename = cmds.add_parser("rename", category="create/modify/delete", help="rename a dataset")
    add_argument(rename, "dsname", "dsname2")
    # revert
    revert = add_argument(cmds.add_parser("revert", category="create/modify/delete",
                                          help="revert a dataset to its previous state"), "dsname")
    revert.add_argument("-i", "--init", action="store_true", help="revert up to initial state",
                        note="this removes all backups")
    # select
    select = cmds.add_parser("select", category="create/modify/delete", help="select a subset of the dataset")
    add_argument(select, "dsname", "dsname2", "number", "query")
    select.add_argument("-s", "--split", action="store_true",
                        help="split original dataset by removing the selected samples from the original")
    # show
    show = add_argument(cmds.add_parser("show", category="read", help="get an overview of the dataset"), "dsname")
    show.add_argument("-l", "--limit", default=10, type=int, help="number of executables to be displayed per format")
    show.add_argument("--per-format", action="store_true", help="display statistics per format")
    # update
    update = cmds.add_parser("update", category="create/modify/delete", help="update a dataset with new executables")
    add_argument(update, "dsname", force=True)
    add_argument(update, "detect")
    update.add_argument("-f", "--formats", type=ts.values_list, default="All", help="list of formats to be considered")
    add_argument(update, "labels")
    add_argument(update, "number", dest="n")
    update.add_argument("-r", "--refresh", action="store_true", help="refresh labels of already existing executables")
    update.add_argument("-s", "--source-dir", action="extend", nargs="*", type=lambda p: ts.Path(p, expand=True),
                        help="executables source directory to be included")
    # view
    view = cmds.add_parser("view", category="read", help="view executables filtered from a dataset")
    add_argument(view, "dsname", "query")
    # NOTE(review): args and logger are first used right after this call ; presumably initialize() parses the command
    #  line and exposes them - confirm against tinyscript's documentation
    initialize(noargs_action="usage")
    configure_logging(args.verbose)
    # nothing to plot without any feature ; the truthiness test also covers a None value, on which len() would raise
    if args.command == "plot" and args.subcommand == "features" and not args.feature:
        logger.warning("No feature selected")
        sys.exit(0)
    # prepare parsed arguments
    # 'load' is only set when a 'name' argument was parsed and the command is not ingest, list or purge
    args.load = getattr(args, "name", None) is not None and args.command not in ["ingest", "list", "purge"]
    set_yaml(args)
    # now execute
    if args.command == "ingest":
        # resolve the name selected with -r/--rename to the related function
        args.rename_func = RENAME_FUNCTIONS[args.rename]
    # the whole namespace of parsed arguments is passed both to the constructor and to the method of the command
    ds = Dataset(**vars(args))
    getattr(ds, args.command)(**vars(args))
    # it may occur that packing fails with a silenced error and that the related executable file remains in the
    #  files subfolder of the dataset while not handled in data.csv, hence creating an inconsistency ;
    #  fixing the dataset right after the make command allows to avoid this inconsistency
    if args.command == "make":
        ds.fix()