forked from shockline/KnowlegeableCNN
-
Notifications
You must be signed in to change notification settings - Fork 1
/
structureTestOnMonster2OneDocTest.py
115 lines (98 loc) · 3.21 KB
/
structureTestOnMonster2OneDocTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from theano import tensor as T, printing
import theano
import numpy
from mlp import HiddenLayer
from logistic_sgd import LogisticRegression
from DocEmbeddingNNOneDoc import DocEmbeddingNNOneDoc
# from DocEmbeddingNNPadding import DocEmbeddingNN
from knoweagebleClassifyFlattened import CorpusReader
import cPickle
import string
import codecs
def work(argv):
print "Started!"
rng = numpy.random.RandomState(23455)
sentenceWordCount = T.ivector("sentenceWordCount")
corpus = T.matrix("corpus")
docLabel = T.ivector('docLabel')
# for list-type data
layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \
sentenceLayerNodesNum=100, \
sentenceLayerNodesSize=[5, 200], \
docLayerNodesNum=100, \
docLayerNodesSize=[3, 100])
layer1 = HiddenLayer(
rng,
input=layer0.output,
n_in=layer0.outputDimension,
n_out=100,
activation=T.tanh
)
layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)
cost = layer2.negative_log_likelihood(1 - layer2.y_pred)
grads = T.grad(cost, layer0.sentenceResults)
score = T.diag(T.dot(grads, T.transpose(layer0.sentenceResults)))
# construct the parameter array.
params = layer2.params + layer1.params + layer0.params
# Load the parameters last time, optionally.
loadParamsVal(params)
print "Compiling computing graph."
output_model = theano.function(
[corpus, sentenceWordCount],
[layer2.y_pred, score]
)
print "Compiled."
cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
count = 0
while(count <= 1000):
info = cr.getCorpus([count, count + 1])
count += 1
if info is None:
print "Pass"
continue
docMatrixes, _, sentenceWordNums, ids, sentences = info
docMatrixes = numpy.matrix(
docMatrixes,
dtype=theano.config.floatX
)
sentenceWordNums = numpy.array(
sentenceWordNums,
dtype=numpy.int32
)
print "start to predict: %s." % ids[0]
pred_y, g = output_model(docMatrixes, sentenceWordNums)
print "End predicting."
print "Writing resfile."
score_sentence_list = zip(g, sentences)
score_sentence_list.sort(key=lambda x:-x[0])
with codecs.open("data/output/" + str(pred_y[0]) + "/" + ids[0], "w", 'utf-8', "ignore") as f:
f .write("pred_y: %i\n" % pred_y[0])
for g0, s in score_sentence_list:
f.write("%f\t%s\n" % (g0, string.join(s, " ")))
# print zip(ids, pred_y[0])
# f = file("data/test/res/res" + str(count), "w")
# f.write(str(zip(ids, pred_y[0])))
# f.close()
print "Written." + str(count)
print "All finished!"
def saveParamsVal(params):
with open("model/scnn.model", 'wb') as f: # open file with write-mode
for para in params:
cPickle.dump(para.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL) # serialize and save object
def loadParamsVal(params):
try:
with open("model/scnn.model", 'rb') as f: # open file with write-mode
for para in params:
para.set_value(cPickle.load(f), borrow=True)
except:
pass
def transToTensor(data, t):
return theano.shared(
numpy.array(
data,
dtype=t
),
borrow=True
)
if __name__ == '__main__':
work("train")