Skip to content

Commit

Permalink
Building the sparse matrix.
Browse files Browse the repository at this point in the history
  • Loading branch information
odarbelaeze committed Aug 22, 2015
1 parent d3bb20a commit 63241f3
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 4 deletions.
40 changes: 36 additions & 4 deletions metadata.py
@@ -1,9 +1,11 @@
import bisect
import collections
import contextlib
import glob
import itertools

import pymongo
import scipy.sparse

from xml.dom import minidom
from nltk.corpus import stopwords
Expand Down Expand Up @@ -46,6 +48,13 @@ def description(self):
return ''
return description.firstChild.nodeValue

@property
def location(self):
location = self.dom_element.getElementsByTagName('lom:location').item(0)
if location is None:
return ''
return location.firstChild.nodeValue

@property
def xml(self):
    """Serialize the wrapped DOM element back to an XML string."""
    document = self.dom_element
    return document.toxml()
Expand All @@ -56,6 +65,7 @@ def metadata(self):
'title': self.title,
'keywords': self.keywords,
'description': self.description,
'location': self.location,
'xml': self.xml,
}

Expand All @@ -65,13 +75,13 @@ def raw_data(self, fields=None):

def is_stopword(word):
    """Return True when *word* (compared case-insensitively) appears in
    the sorted STOPWORDS list, using a binary search for the lookup."""
    lowered = word.lower()
    index = bisect.bisect_left(STOPWORDS, lowered)
    return index < len(STOPWORDS) and STOPWORDS[index] == lowered


def is_valid(word):
    """Return True for alphanumeric tokens that are not stopwords.

    ``str.isalnum()`` is False for empty and whitespace-only strings, so
    the former explicit ``isspace()`` check was redundant; testing the
    cheap predicate first also avoids the bisect-based stopword lookup
    for tokens that can never be valid.
    """
    return word.isalnum() and not is_stopword(word)


def normalize(word):
Expand Down Expand Up @@ -111,14 +121,36 @@ def words():
return sorted(set(itertools.chain.from_iterable(words())))


def matrix(words):
    """Yield ``(record_index, word_index, frequency)`` triples for every
    word occurrence across the stored records.

    ``words`` is the sorted vocabulary; each word is mapped to its column
    index. Record order (row index) follows the cursor returned by
    ``records.find()``.
    """
    word_index = {word: pos for pos, word in enumerate(words)}

    with collection(delete=False) as records:
        for row, record in enumerate(records.find()):
            # Counter consumes the raw_data iterable directly; no need to
            # materialize an intermediate list first.
            frequency = collections.Counter(raw_data(record))
            for word, freq in frequency.items():
                yield row, word_index[word], freq


def main():
    """Ingest XML metadata files into the collection, then build a sparse
    term-frequency matrix over the records and print summary statistics."""
    with collection() as records:

        # Load every XML metadata file found under data/.
        for filename in glob.glob('data/**/*.xml'):
            records.insert_many(
                list(metadata(filename)))

        words = word_set()
        mat = matrix(words)
        # Transpose the (row, col, freq) triples into the parallel
        # sequences expected by the COO-style csr_matrix constructor.
        rowid, colid, freq = zip(*mat)
        sparse = scipy.sparse.csr_matrix((freq, (rowid, colid)))
        nrecs, nwords = sparse.shape

        print('Number of records: ', nrecs)
        print('Number of words: ', nwords)

        # Percentage of non-zero entries; fixes the 'sparcity' typo in
        # the original output string.
        print('sparsity: %f' % (sparse.nnz / (nwords * nrecs) * 100))


# Script entry point: run the ingest-and-summarize pipeline.
if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions piprequirements.txt
Expand Up @@ -2,3 +2,5 @@ nltk==3.0.2
py==1.4.27
pymongo==3.0.2
pytest==2.7.0
numpy==1.9.2
scipy==0.16.0
21 changes: 21 additions & 0 deletions test/test_validators.py
@@ -0,0 +1,21 @@
from metadata import STOPWORDS
from metadata import is_stopword
from metadata import is_valid
import random


def test_is_stopword_works_with_upper_case_words():
    """An upper-cased stopword must still be recognized as a stopword."""
    sample = random.choice(STOPWORDS)
    assert is_stopword(sample.upper())


def test_is_valid_yields_good_values():
    """Alphanumeric tokens are valid; whitespace-only tokens are not."""
    cases = [
        ('123123', True),
        ('askdfj', True),
        (' ', False),
        ('123asdf', True),
        ('\t', False),
    ]
    for token, expected in cases:
        assert is_valid(token) == expected

0 comments on commit 63241f3

Please sign in to comment.