# distribFeat.py — TF-KLD distributional features for the MSR paraphrase corpus
from __future__ import division
import pickle
import numpy
import math
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import NMF, TruncatedSVD
import sentenceFeatures
# Obtain distributional features ((2 * K) in number)
# IMPORTANT: both training and test set must be present in sentences
# sentences is an array of tokenized sentences (matrix of words, basically)
# fullSent is the untokenized version
def distribFeat(fullSent, sentences, K):
    """Compute 2*K distributional (TF-KLD + NMF) features per sentence pair.

    IMPORTANT: both training and test sets must be present in `sentences`;
    consecutive entries (2i, 2i+1) form one paraphrase pair.

    fullSent  -- untokenized sentences (currently unused; retained for the
                 commented-out sentenceFeatures hook)
    sentences -- tokenized sentences (list of word lists)
    K         -- number of NMF components; each pair yields 2*K features

    Returns a list of len(sentences)//2 feature vectors: K component sums
    followed by K absolute component differences.
    """
    # Word -> probability maps pickled by an earlier training step.
    with open("paraphraseMap", "rb") as fin:
        paraphraseMap = pickle.load(fin)
    with open("notParaphrMap", "rb") as fin:
        notParaphrMap = pickle.load(fin)
    n = len(sentences)
    # Vocabulary with O(1) membership and index lookup (the original
    # list + .index() pairing was O(V^2) over the vocabulary).
    wordIndex = {}
    for s in sentences:
        for word in s:
            if word not in wordIndex:
                wordIndex[word] = len(wordIndex)
    # M holds the TF-KLD weight of each word for each sentence.
    M = numpy.zeros((len(wordIndex), n))
    for word, row in wordIndex.items():
        if word in paraphraseMap:
            if word in notParaphrMap:
                p = paraphraseMap[word]
                q = notParaphrMap[word]
                # KL divergence of Bernoulli(p) from Bernoulli(q).
                # NOTE(review): assumes 0 < p < 1 and 0 < q < 1; a value at
                # exactly 0 or 1 raises here — confirm the pickled maps are
                # smoothed upstream.
                kl = p * math.log(p / q) + (1 - p) * math.log((1 - p) / (1 - q))
            else:
                # Seen only among paraphrases: fixed positive weight.
                kl = 1
        else:
            # Never seen in the paraphrase map: contributes nothing.
            kl = 0
        for i in range(n):
            if word in sentences[i]:
                M[row][i] += kl
    # Step 2: matrix factorization, M ~= W * H with H of shape (K, n).
    #factory = TruncatedSVD(n_components = K)
    factory = NMF(n_components = K, max_iter=2000)
    W = factory.fit_transform(M)  # W itself is not needed, only H
    H = factory.components_
    print(M.shape)
    print(W.shape)
    print(H.shape)
    # Step 3: per-pair features.  BUGFIX: the difference was written to
    # feat[j*2], which collides with feat[j] for even j (always for j=0)
    # and leaves the upper half of the vector permanently zero; the sums
    # go in feat[0:K] and the absolute differences in feat[K:2K].
    features = []
    for i in range(0, n, 2):
        feat = [0] * (K * 2)
        for j in range(K):
            feat[j] = H[j][i] + H[j][i + 1]
            feat[K + j] = abs(H[j][i] - H[j][i + 1])
        #feat.extend(sentenceFeatures.compute(fullSent[i],fullSent[i+1]))
        features.append(feat)
    return features
def getData():
    """Load the MSR paraphrase train/test sets and build features.

    Reads "msr_paraphrase_train.txt" and "msr_paraphrase_test.txt"
    (tab-separated; one header line; label in column 0, the two sentences
    in columns 3 and 4).  Both sets are tokenized into one list so
    distribFeat sees the full vocabulary, then the per-pair features are
    split back into train and test by pair count.

    Returns (trainFeat, trainClass, testFeat, testClass).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    sentencesWords = []

    def readPairs(path):
        # One label per pair; appends two tokenized sentences per pair.
        # Pair counts are now derived from the file instead of the
        # hard-coded 4076/1725 the original repeated in three places.
        labels = []
        with open(path, "r") as f:
            f.readline()  # skip header line
            for line in f:
                tokens = line.strip().split('\t')
                if len(tokens) < 5:
                    continue  # tolerate a blank trailing line
                labels.append(int(tokens[0]))
                sentencesWords.append(tokenizer.tokenize(tokens[3].lower()))
                sentencesWords.append(tokenizer.tokenize(tokens[4].lower()))
        return labels

    trainClass = readPairs("msr_paraphrase_train.txt")
    testClass = readPairs("msr_paraphrase_test.txt")
    # distribFeat ignores its first argument, so an empty list is passed
    # (the original passed an always-empty `sentences` list as well).
    allFeat = distribFeat([], sentencesWords, 50)
    print(len(allFeat))
    numTrain = len(trainClass)  # one feature vector per training pair
    trainFeat = allFeat[:numTrain]
    testFeat = allFeat[numTrain:]
    return trainFeat, trainClass, testFeat, testClass