forked from shockline/KnowlegeableCNN
-
Notifications
You must be signed in to change notification settings - Fork 1
/
DocEmbeddingNNOneDoc.py
89 lines (77 loc) · 3.99 KB
/
DocEmbeddingNNOneDoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from theano import tensor as T, printing
import theano
import theano.tensor.signal.downsample as downsample
import theano.tensor.signal.conv as conv
import numpy
class DocEmbeddingNNOneDoc:
    """Convolutional document-embedding network for a single document.

    Builds a two-stage Theano symbolic graph:

    1. sentence level — each sentence's word-embedding rows are convolved
       with ``sentenceW`` and average-pooled down to one vector;
    2. document level — the stack of sentence vectors is convolved with
       ``docW`` and average-pooled down to the final embedding.

    The final embedding is exposed as ``self.output`` (a flat symbolic
    vector) and the trainable shared variables as ``self.params``.
    """

    def __init__(self,
                 corpus,
                 sentenceWordCount,
                 rng,
                 wordEmbeddingDim,
                 sentenceLayerNodesNum=2,
                 sentenceLayerNodesSize=(2, 2),
                 docLayerNodesNum=2,
                 docLayerNodesSize=(2, 3),
                 datatype=theano.config.floatX):
        """Assemble the symbolic graph for one document.

        Parameters
        ----------
        corpus : symbolic matrix
            Word embeddings for the whole document, sentences concatenated
            along the first axis.
        sentenceWordCount : symbolic int vector
            Word offsets into ``corpus``; each consecutive pair delimits
            one sentence (presumably cumulative counts — set by the caller).
        rng : numpy ``RandomState``
            Source of the initial filter weights.
        wordEmbeddingDim : int
            Dimensionality of each word embedding (used only for
            ``outputDimension`` bookkeeping).
        sentenceLayerNodesNum, sentenceLayerNodesSize :
            Number and 2-D shape of the sentence-level convolution filters.
        docLayerNodesNum, docLayerNodesSize :
            Number and 2-D shape of the document-level convolution filters.
        datatype :
            dtype of every shared parameter.
        """
        self.__wordEmbeddingDim = wordEmbeddingDim
        self.__sentenceLayerNodesNum = sentenceLayerNodesNum
        self.__sentenceLayerNodesSize = sentenceLayerNodesSize
        self.__docLayerNodesNum = docLayerNodesNum
        self.__docLayerNodesSize = docLayerNodesSize
        self.__WBound = 0.2      # uniform init range for all filter weights
        self.__MAXDIM = 10000    # pool window wider than any input -> global pooling
        self.__datatype = datatype

        # Filter weights drawn uniformly from [-WBound, WBound]; biases zero.
        # NOTE: the two rng.uniform draws stay in this order so the initial
        # weights are identical to the original implementation.
        self.sentenceW = self.__init_filters(
            rng,
            (sentenceLayerNodesNum,
             sentenceLayerNodesSize[0],
             sentenceLayerNodesSize[1]))
        self.sentenceB = theano.shared(
            value=numpy.zeros((sentenceLayerNodesNum,), dtype=datatype),
            borrow=True)
        self.docW = self.__init_filters(
            rng,
            (docLayerNodesNum, docLayerNodesSize[0], docLayerNodesSize[1]))
        self.docB = theano.shared(
            value=numpy.zeros((docLayerNodesNum,), dtype=datatype),
            borrow=True)

        # One embedding per sentence: taps [-1, 0] hand each scan step the
        # previous and current offsets, i.e. the sentence's slice bounds.
        self.sentenceResults, _ = theano.scan(
            fn=self.__dealWithSentence,
            non_sequences=[corpus, self.sentenceW, self.sentenceB],
            sequences=[dict(input=sentenceWordCount, taps=[-1, 0])],
            strict=True)

        # Document level: convolve over the stacked sentence embeddings,
        # average-pool to a single row, add bias, squash, flatten.
        docConv = conv.conv2d(input=self.sentenceResults, filters=self.docW)
        docPooled = downsample.max_pool_2d(
            docConv, (self.__MAXDIM, 1),
            mode="average_exc_pad", ignore_border=False)
        docActivated = T.tanh(docPooled + self.docB.dimshuffle([0, 'x', 'x']))
        self.output = docActivated.flatten(1)

        self.params = [self.sentenceW, self.sentenceB, self.docW, self.docB]
        # Length of self.output implied by the two valid-mode convolutions.
        self.outputDimension = self.__docLayerNodesNum * \
            (self.__sentenceLayerNodesNum
             * (self.__wordEmbeddingDim - self.__sentenceLayerNodesSize[1] + 1)
             - self.__docLayerNodesSize[1] + 1)

    def __init_filters(self, rng, shape):
        """Return a shared variable of uniform [-WBound, WBound] weights."""
        weights = numpy.asarray(
            rng.uniform(low=-self.__WBound, high=self.__WBound, size=shape),
            dtype=self.__datatype)
        return theano.shared(weights, borrow=True)

    def __dealWithSentence(self, wordOffsetStart, wordOffsetEnd,
                           docs, sentenceW, sentenceB):
        """Embed the sentence occupying docs[wordOffsetStart:wordOffsetEnd].

        Convolve the sentence's word-embedding rows with ``sentenceW``,
        average-pool globally, add the bias, apply tanh, and return the
        result flattened to a vector.
        """
        sentenceRows = docs[wordOffsetStart:wordOffsetEnd]
        convolved = conv.conv2d(input=sentenceRows, filters=sentenceW)
        pooled = downsample.max_pool_2d(
            convolved, (self.__MAXDIM, 1),
            mode="average_exc_pad", ignore_border=False)
        activated = T.tanh(pooled + sentenceB.dimshuffle([0, 'x', 'x']))
        return activated.flatten(1)