Skip to content

Commit

Permalink
hopeless...
Browse files Browse the repository at this point in the history
  • Loading branch information
Matei Macri committed May 16, 2016
1 parent ae3966a commit c6334ee
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
34 changes: 20 additions & 14 deletions distribFeat.py
Expand Up @@ -6,8 +6,10 @@
from sklearn.decomposition import NMF, TruncatedSVD
import sentenceFeatures

# Obtain distributional features (2 * K in number)
# IMPORTANT: both training and test set must be present in sentences
# sentences is an array of tokenized sentences (matrix of words, basically)
# K is the number of distributional features we'll have at the end
# fullSent is the untokenized version
def distribFeat(fullSent, sentences, K):
paraphraseMap = pickle.load(open("paraphraseMap", "rb"))
notParaphrMap = pickle.load(open("notParaphrMap", "rb"))
Expand Down Expand Up @@ -38,10 +40,13 @@ def distribFeat(fullSent, sentences, K):
M[uniqWords.index(word)][i] += kl

# Step 2: Matrix factorization
factory = TruncatedSVD(n_components = K)
#factory = NMF(n_components = K)
factory.fit_transform(M) # M = W*H , returns W, which we don't need
#factory = TruncatedSVD(n_components = K)
factory = NMF(n_components = K, max_iter=2000)
W = factory.fit_transform(M) # M = W*H ; W is kept here only for the debug shape print below
H = factory.components_ # should be size K * n
print(M.shape)
print(W.shape)
print(H.shape)

#Step 3: obtain feature set for paraphrase pair
features = []
Expand All @@ -51,8 +56,6 @@ def distribFeat(fullSent, sentences, K):
for j in range(0, K):
feat[j] = H[j][i] + H[j][i + 1]
feat[j * 2] = abs(H[j][i] - H[j][i + 1])
if feat[j] > 0.1:
print(str(feat[j])+" "+str(feat[j*2]))
#feat.extend(sentenceFeatures.compute(fullSent[i],fullSent[i+1]))
i += 2 # step to next pair of sentences
features.append(feat)
Expand All @@ -69,27 +72,30 @@ def getData():
for i in range(0,4076):
tokens = f.readline().strip().split('\t')
trainClass[i] = int(tokens[0])
sentences.append(tokens[3].lower())
sentences.append(tokens[4].lower())
#sentences.append(tokens[3].lower())
#sentences.append(tokens[4].lower())
sentencesWords.append(tokenizer.tokenize(tokens[3].lower()))
sentencesWords.append(tokenizer.tokenize(tokens[4].lower()))

f.close()
trainFeat = distribFeat(sentences, sentencesWords, 200)
#trainFeat = distribFeat(sentences, sentencesWords, 500)

f = open("msr_paraphrase_test.txt", "r")
f.readline()
sentences = []
sentencesWords = []
#sentences = []
#sentencesWords = []
testClass = [0] * 1725
for i in range(0,1725):
tokens = f.readline().strip().split('\t')
testClass[i] = int(tokens[0])
sentences.append(tokens[3].lower())
sentences.append(tokens[4].lower())
#sentences.append(tokens[3].lower())
#sentences.append(tokens[4].lower())
sentencesWords.append(tokenizer.tokenize(tokens[3].lower()))
sentencesWords.append(tokenizer.tokenize(tokens[4].lower()))

f.close()
testFeat = distribFeat(sentences, sentencesWords, 200)
allFeat = distribFeat(sentences, sentencesWords, 50)
print(len(allFeat))
trainFeat = allFeat[:4076]
testFeat = allFeat[4076:]
return trainFeat, trainClass, testFeat, testClass
4 changes: 2 additions & 2 deletions main.py
Expand Up @@ -193,8 +193,8 @@ def readData():

return trainFeat, trainClass, testFeat, testClass

trainFeat, trainClass, testFeat, testClass = distribFeat.getData()
#trainFeat, trainClass, testFeat, testClass = readData()
#trainFeat, trainClass, testFeat, testClass = distribFeat.getData()
trainFeat, trainClass, testFeat, testClass = readData()
#pickle.dump(trainFeat, open('trainFeat', 'wb'))
#pickle.dump(trainClass, open('trainClass', 'wb'))
#pickle.dump(testFeat, open('testFeat', 'wb'))
Expand Down

0 comments on commit c6334ee

Please sign in to comment.