-
Notifications
You must be signed in to change notification settings - Fork 0
/
vectorise.py
executable file
·118 lines (96 loc) · 3.47 KB
/
vectorise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# TODO: XXX: Clean up this file a bit
'''
Vectorise featurised data.
Author: Pontus Stenetorp <pontus stenetorp se>
Version: 2012-01-13
'''
from argparse import ArgumentParser, FileType
from atexit import register as atexit_register
from os.path import isfile
from sys import stdin, stdout, stderr
try:
from cPickle import (HIGHEST_PROTOCOL, dump as pickle_dump,
load as pickle_load)
except ImportError:
from pickle import (HIGHEST_PROTOCOL, dump as pickle_dump,
load as pickle_load)
### Constants
ARGPARSER = ArgumentParser()#XXX:
# TODO: Bad name really, it is fid to fname really
ARGPARSER.add_argument('catid_to_catname_pickle')
ARGPARSER.add_argument('fid_to_fname_pickle')
ARGPARSER.add_argument('-i', '--input', type=FileType('r'), default=stdin)
ARGPARSER.add_argument('-o', '--output', type=FileType('w'), default=stdout)
ARGPARSER.add_argument('-t', '--run-unittests', action='store_true')
# TODO: Don't save option!
###
# TODO: Dictionary class wrapper that detects an update, reset check when being unpickled
# TODO: Disallow assignment using other options than put
class IdMapper(dict):
# TODO: Fix constructor to conform with dict
def __init__(self, start_id=1):
self.next_id = 1
# Override missing for easy access
def __missing__(self, entry):
new_id = self.next_id
self[entry] = new_id
self.next_id += 1
return new_id
def _load_pickle(pickle_path, save=True):
if isfile(pickle_path):
try:
with open(pickle_path, 'rb') as pickle_file:
loaded_pickle = pickle_load(pickle_file)
except EOFError, e:
print >> stderr, "failed to load pickle %s: %s" % (pickle_path, e)
raise
else:
loaded_pickle = IdMapper()
if save:
atexit_register(_save_pickle_func(loaded_pickle, pickle_path))
return loaded_pickle
def _save_pickle_func(pickle, pickle_path):
def save_pickle():
with open(pickle_path, 'wb') as pickle_file:
pickle_dump(pickle, pickle_file, HIGHEST_PROTOCOL)
return save_pickle
def _tests():
from tempfile import NamedTemporaryFile
from os import remove
tmp_path = None
try:
tmp_path = NamedTemporaryFile(delete=False).name
mapper = IdMapper()
mapper.put('foo')
_save_pickle_func(mapper, tmp_path)()
mapper = _load_pickle(tmp_path, save=False)
assert mapper['foo'] == 1
mapper.put('bar')
assert mapper['bar'] == 2
finally:
if tmp_path is not None:
remove(tmp_path)
return 0
def main(args):
argp = ARGPARSER.parse_args(args[1:])
if argp.run_unittests:
return _tests()
catid_to_catname = _load_pickle(argp.catid_to_catname_pickle)
fid_to_fname = _load_pickle(argp.fid_to_fname_pickle)
for line in (l.rstrip('\n') for l in argp.input):
f_type, f_data = line.split('\t')
f_type_id = catid_to_catname[f_type]
f_vec = {}
#print f_data
for f_name, f_val in ((n, float(v))
for n, v in (t.rsplit(':', 1)
for t in f_data.split(' '))):
#print f_name, f_val
f_vec[fid_to_fname[f_name]] = f_val
argp.output.write('{0} {1}\n'.format(f_type_id, ' '.join(
'{0}:{1}'.format(f_id, f_vec[f_id]) for f_id in sorted(f_vec.keys()))))
#print f_type_id
if __name__ == '__main__':
from sys import argv
exit(main(argv))