#!/usr/bin/env python
'''
Optimise the liblinear penalty parameter using cross-validation.
Author: Pontus Stenetorp <pontus stenetorp se>
Version: 2012-03-26
'''
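# A minimal usage sketch (the file names below are hypothetical):
#
#     python optimisec.py train.svm model.out -f 10 -j 4 -p f-score
#
# This cross-validates a grid of C values with ten folds and four worker
# processes, then writes the model trained with the best C to model.out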
from argparse import ArgumentParser
from atexit import register as atexit_register
from collections import defaultdict, namedtuple
from itertools import izip_longest
from math import fsum, log
from os import devnull, remove
from os.path import dirname, join as path_join
from random import sample
from subprocess import Popen
from sys import stderr
from tempfile import NamedTemporaryFile

from lib.fixedseed import FixedSeed
### Constants
F_SCORE_LBL = 'f-score'
ACC_SCORE_LBL = 'accuracy'
ARGPARSER = ArgumentParser(description='Optimise the liblinear penalty '
        'parameter using cross-validation.')
ARGPARSER.add_argument('data')
ARGPARSER.add_argument('model')
ARGPARSER.add_argument('-f', '--folds', type=int, default=10)
ARGPARSER.add_argument('-j', '--jobs', type=int, default=1)
ARGPARSER.add_argument('-l', '--liblinear-train-cmd', default='train')
ARGPARSER.add_argument('-s', '--seed', type=int, default=0x5648765a)
ARGPARSER.add_argument('-v', '--verbose', action='store_true')
ARGPARSER.add_argument('-c', '--print-c', action='store_true')
ARGPARSER.add_argument('-p', '--optimisation',
choices=(ACC_SCORE_LBL, F_SCORE_LBL, ), default=F_SCORE_LBL)
# TODO: C ranges
# TODO: Model types for liblinear
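# Binary class labels as they appear in the liblinear data files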
NEG_LBL = 1
POS_LBL = 2
###
Model = namedtuple('Model', ('c', 'score', ))
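# Precision P = tp / (tp + fp), recall R = tp / (tp + fn) and
# F1 = 2 * P * R / (P + R); each falls back to 0.0 when its denominator
# is zero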
def _f_score(gold_path, pred_path):
tp = 0
tn = 0
fp = 0
fn = 0
with open(gold_path, 'r') as gold:
with open(pred_path, 'r') as pred:
for gold_val, pred_val in izip_longest(
(int(l.split(' ')[0]) for l in gold),
(int(l.rstrip('\n')) for l in pred)):
assert gold_val is not None, 'gold shorter than pred data'
assert pred_val is not None, 'pred shorter than gold data'
# XXX: Not sure if these assumptions hold outside of eepura
if gold_val == POS_LBL and pred_val == POS_LBL:
tp += 1
elif gold_val == POS_LBL and pred_val == NEG_LBL:
fn += 1
elif gold_val == NEG_LBL and pred_val == POS_LBL:
fp += 1
elif gold_val == NEG_LBL and pred_val == NEG_LBL:
tn += 1
else:
assert False, ('unknown label values, f-score only supports '
'binary classification')
try:
p = float(tp) / (tp + fp)
except ZeroDivisionError:
p = 0.0
try:
r = float(tp) / (tp + fn)
except ZeroDivisionError:
r = 0.0
try:
f = (2 * p * r) / (p + r)
except ZeroDivisionError:
f = 0.0
return f
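# Plain accuracy: the fraction of predictions that agree with the gold labels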
def _accuracy_score(gold_path, pred_path):
    correct = 0
    incorrect = 0
    with open(gold_path, 'r') as gold:
        with open(pred_path, 'r') as pred:
            for gold_val, pred_val in izip_longest(
                    (int(l.split(' ')[0]) for l in gold),
                    (int(l.rstrip('\n')) for l in pred)):
                assert gold_val is not None, 'gold shorter than pred data'
                assert pred_val is not None, 'pred shorter than gold data'
                if gold_val == pred_val:
                    correct += 1
                else:
                    incorrect += 1
    return correct / float(correct + incorrect)
SCORE_F_BY_LBL = {
F_SCORE_LBL: _f_score,
ACC_SCORE_LBL: _accuracy_score,
}
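# Each iteration samples 1 / (number of folds still to assign) of the
# remaining index pool, so e.g. 100 data points and k=10 give folds of
# 100/10, 90/9, 80/8, ... = 10 indices each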
def _fold_to_idx_mapping(data_len, k=10, seed=0x510f62ce):
    # Map each index in [0, data_len) to one of the folds 1..k
    idx_pool = set(xrange(data_len))
idx_to_fold = {}
with FixedSeed(seed):
for fold in xrange(1, k + 1):
if fold != k:
fold_idxs = set(sample(idx_pool,
len(idx_pool) / (k + 1 - fold)))
idx_pool = idx_pool - fold_idxs
else:
# Assign all remaining to this fold
fold_idxs = idx_pool
for fold_idx in fold_idxs:
idx_to_fold[fold_idx] = fold
return idx_to_fold
def _train_model(c, train_path, model_path):
    # Call the bundled liblinear train binary directly, no shell required
    train_cmd = [path_join(dirname(__file__), 'ext', 'liblinear', 'train'),
            '-q', '-c', str(c), train_path, model_path]
    train_p = Popen(train_cmd)
    train_p.wait()
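# Train a model on all folds but one with the given C, predict on the
# held-out fold and score the predictions with the chosen metric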
def _eval_fold_and_c(fold_path, train_paths, c,
optimisation_target=F_SCORE_LBL):
train_path = None
model_path = None
pred_path = None
    try:
        # liblinear expects a single training file, so concatenate the
        # training folds into a temporary one
with NamedTemporaryFile('w', delete=False) as train_file:
for train_path in train_paths:
with open(train_path, 'r') as train:
for line in train:
train_file.write(line)
train_path = train_file.name
with NamedTemporaryFile('w', delete=False) as model_file:
model_path = model_file.name
_train_model(c, train_path, model_path)
with NamedTemporaryFile('w', delete=False) as pred_file:
pred_path = pred_file.name
            pred_cmd = [path_join(dirname(__file__), 'ext', 'liblinear',
                    'predict'), fold_path, model_path, pred_path]
            # liblinear's predict is rather chatty; discard its stdout
            with open(devnull, 'w') as dev_null:
                pred_p = Popen(pred_cmd, stdout=dev_null)
                pred_p.wait()
score = SCORE_F_BY_LBL[optimisation_target](fold_path, pred_path)
        return Model(c=c, score=score)
finally:
for path in (train_path, pred_path, model_path, ):
if path is not None:
remove(path)
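# Unpacking wrapper: Pool.imap_unordered passes a single argument, and only
# module-level functions can be pickled for use in worker processes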
def __eval_fold_and_c(args):
return _eval_fold_and_c(*args)
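# Split the data into folds, cross-validate every candidate C value and
# return the C with the best average score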
def _find_optimal_model(data_path, folds=10, seed=0xc0236b36, pool=None,
verbose=False, optimisation_target=F_SCORE_LBL):
# Segment the data into folds
# TODO: Could be a function
fold_to_fold_fh = {}
for fold in xrange(1, folds + 1):
fold_to_fold_fh[fold] = NamedTemporaryFile('w', delete=False)
@atexit_register
def _fold_files_cleanup():
for fh in fold_to_fold_fh.itervalues():
remove(fh.name)
    # Count the data points so that they can be assigned to folds
    with open(data_path, 'r') as data:
        data_len = sum(1 for _ in data)
    assert data_len > 0, 'no training data found'
idx_to_fold = _fold_to_idx_mapping(data_len, k=folds, seed=seed)
with open(data_path, 'r') as data:
for idx, line in enumerate(data):
fold_to_fold_fh[idx_to_fold[idx]].write(line)
# Close all file-handles so that they are flushed to disc
for fh in fold_to_fold_fh.itervalues():
fh.close()
fold_paths = [fh.name for fh in fold_to_fold_fh.itervalues()]
    # XXX: Hard-coded C values
    #c_values = [2 ** c_pow for c_pow in xrange(17, -7, -2)]
    # Tweaked after a bit of a look at the curves for the "bow",
    # "comp" and "david" models
    c_values = [2 ** c_pow for c_pow in xrange(5, -7, -1)]
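    # i.e. C = 32, 16, ..., 0.015625 (the powers of two from 5 down to -6)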
# Evaluate all C-values to find the optimal model
    def _eval_args():
        # High C values are likely to take longer to train; yield them
        # first so that the slowest jobs are started as early as possible
for c_value in c_values:
for fold_path in fold_paths:
                train_paths = [p for p in fold_paths if p != fold_path]
                yield (fold_path, train_paths, c_value, optimisation_target)
if pool is not None:
eval_results = pool.imap_unordered(__eval_fold_and_c, _eval_args())
else:
eval_results = (__eval_fold_and_c(a) for a in _eval_args())
c_to_models = defaultdict(list)
for model in eval_results:
c_to_models[model.c].append(model)
    # Average the per-fold scores for each C into a single summary model
avg_models = []
for c, models in c_to_models.iteritems():
avg_score = fsum(m.score for m in models) / len(models)
avg_models.append(Model(c=c, score=avg_score))
    if verbose:
        avg_models.sort(key=lambda x: x.c)
        for model in avg_models:
            print >> stderr, '{0}\t{1}'.format(model.c, model.score)
    avg_models.sort(key=lambda x: x.score)
    optimal_c = avg_models[-1].c
if verbose:
print >> stderr, 'Optimal C:', optimal_c
# Small paranoid check
if optimal_c == c_values[0] or optimal_c == c_values[-1]:
        print >> stderr, ('WARNING: Optimal C value {0} lies on the boundary '
                'of the tested values [{1}]; the true optimum may lie '
                'outside this range').format(optimal_c,
                        ', '.join(str(c) for c in c_values[::-1]))
return optimal_c
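# Find the best C by cross-validation, optionally print log2 of it, then
# train the final model on all of the data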
def train_optimal_model(model_path, data_path, folds=10, seed=0x994c00d8,
pool=None, verbose=False, optimisation_target=F_SCORE_LBL,
print_c=False):
optimal_c = _find_optimal_model(data_path, folds=folds, seed=seed,
pool=pool, verbose=verbose,
optimisation_target=optimisation_target)
if print_c:
        print int(log(optimal_c, 2))
_train_model(optimal_c, data_path, model_path)
def main(args):
argp = ARGPARSER.parse_args(args[1:])
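    # A multiprocessing pool is only created when more than one job is
    # requested; otherwise evaluation runs sequentially in-process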
if argp.jobs > 1:
from multiprocessing import Pool
pool = Pool(argp.jobs)
else:
pool = None
train_optimal_model(argp.model, argp.data, folds=argp.folds,
seed=argp.seed, pool=pool, verbose=argp.verbose,
optimisation_target=argp.optimisation, print_c=argp.print_c)
return 0
if __name__ == '__main__':
from sys import argv
exit(main(argv))