Skip to content

Commit

Permalink
hopeless...
Browse files Browse the repository at this point in the history
  • Loading branch information
Matei Macri committed May 16, 2016
1 parent ae3966a commit c6334ee
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
34 changes: 20 additions & 14 deletions distribFeat.py
Expand Up @@ -6,8 +6,10 @@
from sklearn.decomposition import NMF, TruncatedSVD
import sentenceFeatures

# Obtain distributional features (2 * K in number)
# IMPORTANT: both training and test set must be present in sentences
# sentences is an array of tokenized sentences (matrix of words, basically)
# K is the number of distributional features we'll have at the end
# fullSent is the untokenized version
def distribFeat(fullSent, sentences, K):
paraphraseMap = pickle.load(open("paraphraseMap", "rb"))
notParaphrMap = pickle.load(open("notParaphrMap", "rb"))
Expand Down Expand Up @@ -38,10 +40,13 @@ def distribFeat(fullSent, sentences, K):
M[uniqWords.index(word)][i] += kl

# Step 2: Matrix factorization
factory = TruncatedSVD(n_components = K)
#factory = NMF(n_components = K)
factory.fit_transform(M) # M = W*H , returns W, which we don't need
#factory = TruncatedSVD(n_components = K)
factory = NMF(n_components = K, max_iter=2000)
W = factory.fit_transform(M) # M = W*H ; W is kept here only for the debug shape print below
H = factory.components_ # should be size K * n
print(M.shape)
print(W.shape)
print(H.shape)

#Step 3: obtain feature set for paraphrase pair
features = []
Expand All @@ -51,8 +56,6 @@ def distribFeat(fullSent, sentences, K):
for j in range(0, K):
feat[j] = H[j][i] + H[j][i + 1]
feat[j * 2] = abs(H[j][i] - H[j][i + 1])
if feat[j] > 0.1:
print(str(feat[j])+" "+str(feat[j*2]))
#feat.extend(sentenceFeatures.compute(fullSent[i],fullSent[i+1]))
i += 2 # step to next pair of sentences
features.append(feat)
Expand All @@ -69,27 +72,30 @@ def getData():
for i in range(0,4076):
tokens = f.readline().strip().split('\t')
trainClass[i] = int(tokens[0])
sentences.append(tokens[3].lower())
sentences.append(tokens[4].lower())
#sentences.append(tokens[3].lower())
#sentences.append(tokens[4].lower())
sentencesWords.append(tokenizer.tokenize(tokens[3].lower()))
sentencesWords.append(tokenizer.tokenize(tokens[4].lower()))

f.close()
trainFeat = distribFeat(sentences, sentencesWords, 200)
#trainFeat = distribFeat(sentences, sentencesWords, 500)

f = open("msr_paraphrase_test.txt", "r")
f.readline()
sentences = []
sentencesWords = []
#sentences = []
#sentencesWords = []
testClass = [0] * 1725
for i in range(0,1725):
tokens = f.readline().strip().split('\t')
testClass[i] = int(tokens[0])
sentences.append(tokens[3].lower())
sentences.append(tokens[4].lower())
#sentences.append(tokens[3].lower())
#sentences.append(tokens[4].lower())
sentencesWords.append(tokenizer.tokenize(tokens[3].lower()))
sentencesWords.append(tokenizer.tokenize(tokens[4].lower()))

f.close()
testFeat = distribFeat(sentences, sentencesWords, 200)
allFeat = distribFeat(sentences, sentencesWords, 50)
print(len(allFeat))
trainFeat = allFeat[:4076]
testFeat = allFeat[4076:]
return trainFeat, trainClass, testFeat, testClass
4 changes: 2 additions & 2 deletions main.py
Expand Up @@ -193,8 +193,8 @@ def readData():

return trainFeat, trainClass, testFeat, testClass

trainFeat, trainClass, testFeat, testClass = distribFeat.getData()
#trainFeat, trainClass, testFeat, testClass = readData()
#trainFeat, trainClass, testFeat, testClass = distribFeat.getData()
trainFeat, trainClass, testFeat, testClass = readData()
#pickle.dump(trainFeat, open('trainFeat', 'wb'))
#pickle.dump(trainClass, open('trainClass', 'wb'))
#pickle.dump(testFeat, open('testFeat', 'wb'))
Expand Down

0 comments on commit c6334ee

Please sign in to comment.