Skip to content

Commit

Permalink
Building the sparse matrix.
Browse files Browse the repository at this point in the history
  • Loading branch information
odarbelaeze committed Aug 22, 2015
1 parent d3bb20a commit 63241f3
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 4 deletions.
40 changes: 36 additions & 4 deletions metadata.py
@@ -1,9 +1,11 @@
import bisect
import collections
import contextlib
import glob
import itertools

import pymongo
import scipy.sparse

from xml.dom import minidom
from nltk.corpus import stopwords
Expand Down Expand Up @@ -46,6 +48,13 @@ def description(self):
return ''
return description.firstChild.nodeValue

@property
def location(self):
location = self.dom_element.getElementsByTagName('lom:location').item(0)
if location is None:
return ''
return location.firstChild.nodeValue

@property
def xml(self):
    """Serialize the wrapped DOM element back to an XML string."""
    document = self.dom_element
    return document.toxml()
Expand All @@ -56,6 +65,7 @@ def metadata(self):
'title': self.title,
'keywords': self.keywords,
'description': self.description,
'location': self.location,
'xml': self.xml,
}

Expand All @@ -65,13 +75,13 @@ def raw_data(self, fields=None):

def is_stopword(word):
    """Return True when *word* (compared case-insensitively) appears in
    the sorted STOPWORDS list, using a binary search for the lookup."""
    lowered = word.lower()
    index = bisect.bisect_left(STOPWORDS, lowered)
    return index < len(STOPWORDS) and STOPWORDS[index] == lowered


def is_valid(word):
    """Return True for alphanumeric tokens that are not stopwords.

    ``str.isalnum()`` is False for empty and whitespace-only strings, so
    the former explicit ``isspace()`` check was redundant; testing the
    cheap predicate first also avoids the bisect-based stopword lookup
    for tokens that can never be valid.
    """
    return word.isalnum() and not is_stopword(word)


def normalize(word):
Expand Down Expand Up @@ -111,14 +121,36 @@ def words():
return sorted(set(itertools.chain.from_iterable(words())))


def matrix(words):
    """Yield ``(record_index, word_index, frequency)`` triples for every
    word occurrence across the stored records.

    ``words`` is the sorted vocabulary; each word is mapped to its column
    index. Record order (row index) follows the cursor returned by
    ``records.find()``.
    """
    word_index = {word: pos for pos, word in enumerate(words)}

    with collection(delete=False) as records:
        for row, record in enumerate(records.find()):
            # Counter consumes the raw_data iterable directly; no need to
            # materialize an intermediate list first.
            frequency = collections.Counter(raw_data(record))
            for word, freq in frequency.items():
                yield row, word_index[word], freq


def main():
    """Ingest XML metadata files into the collection, then build a sparse
    term-frequency matrix over the records and print summary statistics."""
    with collection() as records:

        # Load every XML metadata file found under data/.
        for filename in glob.glob('data/**/*.xml'):
            records.insert_many(
                list(metadata(filename)))

        words = word_set()
        mat = matrix(words)
        # Transpose the (row, col, freq) triples into the parallel
        # sequences expected by the COO-style csr_matrix constructor.
        rowid, colid, freq = zip(*mat)
        sparse = scipy.sparse.csr_matrix((freq, (rowid, colid)))
        nrecs, nwords = sparse.shape

        print('Number of records: ', nrecs)
        print('Number of words: ', nwords)

        # Percentage of non-zero entries; fixes the 'sparcity' typo in
        # the original output string.
        print('sparsity: %f' % (sparse.nnz / (nwords * nrecs) * 100))


# Script entry point: run the ingest-and-summarize pipeline.
if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions piprequirements.txt
Expand Up @@ -2,3 +2,5 @@ nltk==3.0.2
py==1.4.27
pymongo==3.0.2
pytest==2.7.0
numpy==1.9.2
scipy==0.16.0
21 changes: 21 additions & 0 deletions test/test_validators.py
@@ -0,0 +1,21 @@
from metadata import STOPWORDS
from metadata import is_stopword
from metadata import is_valid
import random


def test_is_stopword_works_with_upper_case_words():
    """An upper-cased stopword must still be recognized as a stopword."""
    sample = random.choice(STOPWORDS)
    assert is_stopword(sample.upper())


def test_is_valid_yields_good_values():
    """Alphanumeric tokens are valid; whitespace-only tokens are not."""
    cases = [
        ('123123', True),
        ('askdfj', True),
        (' ', False),
        ('123asdf', True),
        ('\t', False),
    ]
    for token, expected in cases:
        assert is_valid(token) == expected

0 comments on commit 63241f3

Please sign in to comment.