-
Notifications
You must be signed in to change notification settings - Fork 53
/
dan_sentiment.py
262 lines (205 loc) · 9.25 KB
/
dan_sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
from numpy import *
from util.sentiment_util import *
from util.math_util import *
from util.adagrad import Adagrad
import cPickle, time, argparse
from collections import Counter
# compute model accuracy on a given fold
def validate(data, fold, params, deep, f=relu):
correct = 0.
total = 0.
for sent, label in data:
if len(sent) == 0:
continue
av = average(params[-1][:, sent], axis=1)
# forward prop
acts = zeros((deep, dh))
for i in range(0, deep):
start = i * 2
prev = av if i == 0 else acts[i - 1]
acts[i] = f(params[start].dot(prev) + params[start + 1])
Ws = params[deep * 2]
bs = params[deep * 2 + 1]
if deep == 0:
pred = softmax(Ws.dot(av) + bs).ravel()
else:
pred = softmax(Ws.dot(acts[-1]) + bs).ravel()
if argmax(pred) == label:
correct += 1
total += 1
print 'accuracy on ', fold, correct, total, str(correct / total), '\n'
return correct / total
# does both forward and backprop
def objective_and_grad(data, params, d, dh, len_voc, deep, labels, f=relu, df=drelu, compute_grad=True, word_drop=0.3, rho=1e-4, fine_tune=True):
    """Compute the regularized average cross-entropy cost over a minibatch
    and (optionally) its gradient as a flat vector.

    data         - list of (sent, label) pairs; sent is a list of word indices
    params       - flat (rolled) parameter vector
    d            - word embedding dimensionality
    dh           - hidden layer dimensionality
    len_voc      - vocabulary size
    deep         - number of hidden layers
    labels       - number of output classes
    f, df        - nonlinearity and its derivative (df is applied to the
                   activations themselves, relu-style)
    compute_grad - if False, return only the cost
    word_drop    - probability of dropping each word from the average
    rho          - L2 regularization weight
    fine_tune    - if truthy, also compute gradients for the embeddings

    Returns (cost, grad) if compute_grad else cost.
    """
    params = unroll_params(params, d, dh, len_voc, deep=deep, labels=labels)
    grads = init_grads(d, dh, len_voc, deep=deep, labels=labels)
    error_sum = 0.0
    for sent,label in data:
        # skip empty sentences: nothing to average
        if len(sent) == 0:
            continue
        # store each layer's normalized and unnormalized acts
        acts = zeros((deep, dh))
        # one-hot target distribution for the gold label
        target = zeros(labels)
        target[label] = 1.0
        # input is average of all nouns in sentence
        # word dropout: each token is independently dropped with prob word_drop
        curr_sent = []
        mask = random.rand(len(sent)) > word_drop
        for index, keep in enumerate(mask):
            if keep:
                curr_sent.append(sent[index])
        # all examples must have at least one word
        if len(curr_sent) == 0:
            curr_sent = sent
        av = average(params[-1][:, curr_sent], axis=1)
        # forward prop
        for i in range(0, deep):
            start = i * 2
            prev = av if i == 0 else acts[i - 1]
            acts[i] = f(params[start].dot(prev) + params[start + 1])
        # compute softmax error
        Ws = params[deep * 2]
        bs = params[deep * 2 + 1]
        if deep == 0:
            # no hidden layers: softmax reads the averaged embedding directly
            pred = softmax(Ws.dot(av) + bs).ravel()
            error_sum += crossent(target, pred)
            soft_delta = dcrossent(target, pred)
            grads[deep * 2] += outer(soft_delta, av)
            grads[deep * 2 + 1] += soft_delta
            delta = Ws.T.dot(soft_delta)
            if fine_tune:
                # every averaged word shares the input gradient equally
                # NOTE(review): fancy-index += does not accumulate duplicate
                # word indices within one sentence — verify this is intended
                grads[-1][:, curr_sent] += delta.reshape((d, 1)) / len(curr_sent)
        else:
            pred = softmax(Ws.dot(acts[-1]) + bs).ravel()
            error_sum += crossent(target, pred)
            soft_delta = dcrossent(target, pred)
            grads[deep * 2] += outer(soft_delta, acts[-1])
            grads[deep * 2 + 1] += soft_delta
            # backprop through hidden layers, top to bottom
            prev_delta = Ws.T.dot(soft_delta)
            for i in range(deep - 1, -1, -1):
                start = i * 2
                deriv = df(acts[i])
                delta = deriv * prev_delta
                if i > 0:
                    grads[start] += outer(delta, acts[i-1])
                    grads[start + 1] += delta
                    prev_delta = params[start].T.dot(delta)
                else:
                    # bottom layer: its input is the averaged embedding
                    grads[0] += outer(delta, av)
                    grads[1] += delta
                    if fine_tune:
                        grads[-1][:, curr_sent] += params[0].T.dot(delta).reshape((d, 1)) / len(curr_sent)
    # L2 penalty on every parameter matrix
    for index in range(0, len(params)):
        # don't regularize embeddings if finetune=false
        if not fine_tune and index == len(params) - 1:
            continue
        error_sum += 0.5 * rho * sum(params[index] ** 2)
        grads[index] += rho * params[index]
    cost = error_sum / len(data)
    grad = roll_params(grads) / len(data)
    if compute_grad:
        return cost, grad
    else:
        return cost
if __name__ == '__main__':
# command line arguments
parser = argparse.ArgumentParser(description='sentiment DAN')
parser.add_argument('-data', help='location of dataset', default='data/sentiment/')
parser.add_argument('-vocab', help='location of vocab', default='data/sentiment/wordMapAll.bin')
parser.add_argument('-We', help='location of word embeddings', default='data/sentiment_all_We')
parser.add_argument('-rand_We', help='randomly init word embeddings', type=int, default=0)
parser.add_argument('-binarize', help='binarize labels', type=int, default=0)
parser.add_argument('-d', help='word embedding dimension', type=int, default=300)
parser.add_argument('-dh', help='hidden dimension', type=int, default=300)
parser.add_argument('-deep', help='number of layers', type=int, default=3)
parser.add_argument('-drop', help='dropout probability', type=float, default=0.3)
parser.add_argument('-rho', help='regularization weight', type=float, default=1e-4)
parser.add_argument('-labels', help='number of labels', type=int, default=5)
parser.add_argument('-ft', help='fine tune word vectors', type=int, default=1)
parser.add_argument('-b', '--batch_size', help='adagrad minibatch size (ideal: 25 minibatches \
per epoch). for provided datasets, x for history and y for lit', type=int,\
default=15)
parser.add_argument('-ep', '--num_epochs', help='number of training epochs, can also determine \
dynamically via validate method', type=int, default=5)
parser.add_argument('-agr', '--adagrad_reset', help='reset sum of squared gradients after this many\
epochs', type=int, default=50)
parser.add_argument('-lr', help='adagrad initial learning rate', type=float, default=0.005)
parser.add_argument('-o', '--output', help='desired location of output model', \
default='models/sentiment_params.pkl')
args = vars(parser.parse_args())
d = args['d']
dh = args['dh']
# load data
train = cPickle.load(open(args['data']+'train-rootfine', 'rb'))
dev = cPickle.load(open(args['data']+'dev-rootfine', 'rb'))
test = cPickle.load(open(args['data']+'test-rootfine', 'rb'))
vocab = cPickle.load(open(args['vocab'], 'rb'))
len_voc = len(vocab)
for split in [train, dev, test]:
c = Counter()
tot = 0
for sent, label in split:
c[label] += 1
tot += 1
print c, tot
if args['rand_We']:
print 'randomly initializing word embeddings...'
orig_We = (random.rand(d, len_voc) * 2 - 1) * 0.08
else:
print 'loading pretrained word embeddings...'
orig_We = cPickle.load(open(args['We'], 'rb'))
# output log and parameter file destinations
param_file = args['output']
log_file = param_file.split('_')[0] + '_log'
# generate params / We
params = init_params(d, dh, deep=args['deep'], labels=args['labels'])
# add We matrix to params
params += (orig_We, )
r = roll_params(params)
dim = r.shape[0]
print 'parameter vector dimensionality:', dim
log = open(log_file, 'w')
# minibatch adagrad training
ag = Adagrad(r.shape, args['lr'])
min_error = float('inf')
for epoch in range(0, args['num_epochs']):
lstring = ''
# create mini-batches
random.shuffle(train)
batches = [train[x : x + args['batch_size']] for x in xrange(0, len(train),
args['batch_size'])]
epoch_error = 0.0
ep_t = time.time()
for batch_ind, batch in enumerate(batches):
now = time.time()
err, grad = objective_and_grad(batch, r, d, dh, len_voc,
args['deep'], args['labels'], word_drop=args['drop'],
fine_tune=args['ft'], rho=args['rho'])
update = ag.rescale_update(grad)
r = r - update
lstring = 'epoch: ' + str(epoch) + ' batch_ind: ' + str(batch_ind) + \
' error, ' + str(err) + ' time = '+ str(time.time()-now) + ' sec'
log.write(lstring + '\n')
log.flush()
epoch_error += err
# done with epoch
print time.time() - ep_t
print 'done with epoch ', epoch, ' epoch error = ', epoch_error, ' min error = ', min_error
lstring = 'done with epoch ' + str(epoch) + ' epoch error = ' + str(epoch_error) \
+ ' min error = ' + str(min_error) + '\n'
log.write(lstring)
log.flush()
# save parameters if the current model is better than previous best model
if epoch_error < min_error:
min_error = epoch_error
params = unroll_params(r, d, dh, len_voc, deep = args['deep'], labels=args['labels'])
# d_score = validate(dev, 'dev', params, args['deep'])
cPickle.dump( params, open(param_file, 'wb'))
log.flush()
# reset adagrad weights
if epoch % args['adagrad_reset'] == 0 and epoch != 0:
ag.reset_weights()
log.close()
# compute test score
params = unroll_params(r, d, dh, len_voc, deep = args['deep'], labels=args['labels'])
t_score = validate(test, 'test', params, args['deep'])