#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from pbox import *
from tinyscript import *
__script__ = "Model manipulation tool"
__version__ = "1.3.7"
__contributors__ = [
    {'author': "Sébastien Martinez", 'reason': "extended ML algorithms, metrics and visualizations to unsupervised "
                                               "learning"},
]
__doc__ = """
This utility aims to train Machine Learning models based on an input dataset. It currently supports various algorithms
and allows selecting a data scaler. Moreover, it supports Grid Search cross-validation for some algorithms.
"""
__description__ = "Train Machine Learning models"
__examples__ = [
    "list",
    "preprocess [model_name] my-dataset",
    "train my-dataset -A mnb",
    "show [model_name]",
]
if __name__ == '__main__':
    sparsers = parser.add_subparsers(dest="command", metavar="CMD", title="positional argument",
                                     help="command to be executed")
    browse = sparsers.add_parser("browse", category="read",
                                 help="browse input data (including predictions) based on the selected criteria")
    add_argument(browse, "mdname", "executable", nargs="?")
    browse.add_argument("-q", "--query", help="query for filtering records to be selected",
                        note="see <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html>")
    browse.add_argument("--no-feature", action="store_true", help="do not show computed features")
    compare = add_argument(sparsers.add_parser("compare", category="read",
                                               help="compare the selected model with others"), "mdname")
    compare.add_argument("-d", "--dataset", type=Dataset.load, action="extend", nargs="*",
                         help="dataset to be selected for the comparison")
    compare.add_argument("-i", "--include", action="store_true", help="include unformatted models")
    compare.add_argument("-m", "--model", type=Model.load, action="extend", nargs="*",
                         help="model to be added in the comparison")
    edit = sparsers.add_parser("edit", category="create/modify/delete", help="edit the performance log file")
    add_argument(edit, "mdname", force=True)
    listm = sparsers.add_parser("list", category="read", help="list all the models from the workspace")
    listm.add_argument("--algorithms", action="store_true", help="show available algorithms instead of models")
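    # 'preprocess' applies the selected model's preprocessors to a dataset and displays the result without training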
    preproc = sparsers.add_parser("preprocess", category="read", help="preprocess features and visualize the result")
    add_argument(preproc, "mdname")
    add_argument(preproc, "executable", nargs="?",
                 note="similar to 'dataset browse', except that it requires a model to apply its preprocessors "
                      "in addition")
    add_argument(preproc, "features-set")
    add_argument(preproc, "mi-select")
    add_argument(preproc, "mi-kbest")
    add_argument(sparsers.add_parser("purge", category="create/modify/delete",
                                     help="purge the selected model"), "mdname", force=True)
    rename = add_argument(sparsers.add_parser("rename", category="create/modify/delete",
                                              help="rename the selected model"), "mdname")
    rename.add_argument("name2", type=ts.folder_does_not_exist, help="new name of the model")
    add_argument(sparsers.add_parser("show", category="read", help="get an overview of the model"), "mdname")
    test = sparsers.add_parser("test", category="read", help="test the model on a given input")
    add_argument(test, "mdname", "executable", "features-set", "ignore-labels")
    test.add_argument("--sep", default=",", choices=",;|\t", help="set the CSV separator",
                      note="required when using input CSV data instead of a Dataset (no effect otherwise)")
    add_argument(test, "true-class")
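    # 'train' is the main modeling command: it fits the chosen algorithm on the given dataset, with optional
    #  cross-validation, feature selection and per-algorithm parameters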
    train = sparsers.add_parser("train", category="create/modify/delete", help="train a model on the given dataset")
    train.add_argument("dataset", type=Dataset.load, help="dataset for training the model")
    train.add_argument("-a", "--algorithms-set", metavar="YAML", type=ts.file_exists, default=config['algorithms'],
                       help="algorithms set's YAML definition")
train.add_argument("-A", "--algorithm", choices=[a.name for a in Algorithm.registry], default="dt", metavar="ALGO",
help="machine learning algorithm to be used\n- %s\n * supports Grid Search cross-validation\n" %\
"\n- ".join("%s: %s%s" % (a.name.ljust(8), a.description, ["", "*"][a.parameters.get('cv') \
is not None]) for a in Algorithm.registry))
add_argument(train, "features-set")
tgroup = train.add_mutually_exclusive_group()
tgroup.add_argument("-m", "--multiclass", action="store_true", help="train the model using multiple label classes")
add_argument(tgroup, "true-class")
train.add_argument("-r", "--reset", action="store_true", help="reset the model before (re)training")
train.add_argument("--cv", default=5, type=ts.pos_int, help="number of Cross-Validation folds")
add_argument(train, "ignore-labels")
train.add_argument("--n-jobs", type=lambda x: ts.pos_int(x, False), help="number of jobs to be run in parallel")
train.add_argument("-w", "--wrapper-select", action="store_true", help="use recursive feature elimination with CV "
"wrapper method to select optimal features before model training")
train.add_argument("-p", "--param", nargs="*", help="comma-separated list of parameters for the algorithm",
note="fixing a parameter with this option disables it from the cross-validation")
train.add_argument("-W", "--wrapper-param", nargs="*",type=lambda x: x.split(","), help="comma-separated list of "
"parameters for the wrapper feature selection algorithm")
add_argument(train, "mi-select")
add_argument(train, "mi-kbest")
    visualize = add_argument(sparsers.add_parser("visualize", category="read", help="visualize the model"), "mdname")
    visualize.add_argument("-e", "--export", action="store_true", help="export to PNG")
    visualize.add_argument("-o", "--output-dir", metavar="DIR", default=".", help="output directory")
    viz_dependent = visualize.add_argument_group("options depending on the chosen algorithm", before="extra arguments")
    viz_dependent.add_argument("--imputer-strategy", default="mean", choices=("constant", "mean", "most_frequent"),
                               help="strategy for imputing missing values")
    add_argument(viz_dependent, "reduction-algorithm")
    viz_dependent.add_argument("-d", "--distance-threshold", default=15, type=int,
                               help="distance threshold for dendrogram colors")
    viz_dependent.add_argument("-f", "--features", nargs="*", type=lambda x: x.split(","), default="",
                               help="comma-separated list of features to be selected")
    viz_dependent.add_argument("-i", "--invert-cluster-colors", action="store_true", help="invert cluster colors")
    viz_dependent.add_argument("-l", "--hierarchy-levels", metavar="H", default=4, type=int,
                               help="number of levels to display when truncating the dendrogram")
    add_argument(viz_dependent, "ncomponents", "perplexity")
    viz_dependent.add_argument("-r", "--reduce-train-data", action="store_true", help="reduce the training data",
                               note="the data is only reduced for the visualization by default")
    viz_dependent.add_argument("-t", "--plot-labels", action="store_true", help="plot true labels of packers")
    viz_dependent.add_argument("--horizontal", action="store_true",
                               help="display subplots horizontally instead of vertically")
    viz_dependent.add_argument("--multiclass", action="store_true", help="plot multiclass true labels of packers")
    viz_dependent.add_argument("--plot-extensions", action="store_true", help="plot the file extensions")
    viz_dependent.add_argument("--plot-formats", action="store_true", help="plot the file formats")
    viz_dependent.add_argument("--range", type=float, nargs=4,
                               help="select range values to plot in format: x_min x_max y_min y_max")
    initialize(noargs_action="usage")
    configure_logging(args.verbose)
    # prepare parsed arguments
    args.load = getattr(args, "name", None) is not None and args.command not in ["list", "purge"]
    set_yaml(args)
    # now execute
    if args.command == "train":
        args.param = expand_parameters(*(args.param or []))
if args.command == "visualize":
args.viz_params = {}
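        # collect the visualization-specific options into 'viz_params' and drop them from the argument namespace
        #  before Model(**vars(args)) gets called below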
for a in ["distance_threshold", "features", "hierarchy_levels", "horizontal", "imputer_strategy",
"invert_cluster_colors", "multiclass", "n_components", "perplexity", "plot_extensions",
"plot_formats", "plot_labels", "range", "reduce_train_data", "reduction_algorithm"]:
args.viz_params[a] = getattr(args, a)
delattr(args, a)
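    # instantiate the Model with the parsed arguments and dispatch to the method named after the selected subcommand
    #  (e.g. 'train' calls Model(...).train(...))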
    getattr(Model(**vars(args)), args.command)(**vars(args))