Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Merge branch 'mmm_fix' of github.com:neara/Open-Knesset into mmm

  • Loading branch information...
commit 17f3c1793e43445aaf402229abb93e774ba34fce 2 parents 443bea0 + 2548980
@ofri authored
View
48 src/knesset/mmm/management/commands/update_mmm.py
@@ -1,14 +1,50 @@
+from datetime import datetime
+
from django.core.management.base import NoArgsCommand, CommandError
+from django.utils import simplejson
+
from knesset.mmm.models import Document
from knesset.settings import DATA_ROOT
-import simplejson
def parse_json(str):
    """Parse the raw text of matches.json into python objects.

    params:
        str - fp.read() result of the json file

    returns:
        the loaded list of dictionaries, where every 'date' value
        (text in day/month/year form) is replaced by a datetime object.
    """
    entries = simplejson.loads(str)

    # swap each textual date for its datetime equivalent, in place
    for entry in entries:
        raw_date = entry['date']
        entry['date'] = datetime.strptime(raw_date, '%d/%m/%Y')

    return entries
+
def combine_jsons(matches, mmm):
    """Add the authors listed in mmm.json to the entries of matches.json.

    params:
        matches - fp.read() result of matches.json
        mmm - fp.read() result of mmm.json

    returns:
        list of matches objects (as returned by parse_json), each with an
        added 'author' key holding the ', '-joined authors that mmm.json
        lists for the same url. A match whose url has no authors in
        mmm.json gets an empty string instead of aborting the whole run.
    """
    # parse matches.json and mmm.json to python obj's
    documents = parse_json(matches)
    mmm_entries = simplejson.loads(mmm)

    # create a dictionary of urls - authors; a missing/empty authors list
    # yields an empty string
    authors_by_url = dict(
        (entry['url'], ', '.join(entry.get('authors') or []))
        for entry in mmm_entries
    )

    # modifying matches to include author field; not every matched url is
    # guaranteed to appear in mmm.json, so fall back to no author
    for doc in documents:
        doc['author'] = authors_by_url.get(doc['url'], '')

    return documents
+
class Command(NoArgsCommand):
    """Management command that updates the mmm table.

    Reads matches.json and mmm.json from DATA_ROOT, merges them with
    combine_jsons and passes the result to Document.objects.from_json.
    """
    help = "Updating mmm table"

    def handle_noargs(self, **options):
        """Load both json files and import the combined entries."""
        # context managers close the files even when reading fails
        with open(DATA_ROOT + 'matches.json', 'rt') as matches_file:
            matches = matches_file.read()
        with open(DATA_ROOT + 'mmm.json', 'rt') as mmm_file:
            mmm = mmm_file.read()

        Document.objects.from_json(combine_jsons(matches, mmm))
View
55 src/knesset/mmm/models.py
@@ -1,44 +1,34 @@
-from django.db import models
-from datetime import datetime
-from knesset.mks.models import Member
-from knesset.committees.models import Committee
-import simplejson
-import re
import logging
+import difflib
-logger = logging.getLogger("open-knesset.mmm.models")
+from django.db import models
+from django.utils import simplejson
-def parse_json(fp):
- """ recieves fp from data folder, loads/parses and returns a list of dictionaries """
-
- result = simplejson.load(fp)
-
- # modifying the data to be suitable for use
- for o in result:
- o['candidates'] = re.sub(r"\s+", r" " , " ".join(o['candidates']))
- o['date'] = datetime.strptime(o['date'], '%d/%m/%Y')
-
- return result
+from knesset.mks.models import Member
+from knesset.committees.models import Committee
+logger = logging.getLogger("open-knesset.mmm.models")
def text_lookup(modelName, text):
    """Return the ids of the modelName objects whose name is found in text.

    A name counts as found when it appears verbatim in the text or, failing
    that, when difflib rates the name and the whole text as at least 0.6
    similar (a warning is logged for every such fuzzy hit).

    params:
        modelName - model class; each object of modelName.objects.all()
                    is read through its `name` and `id` attributes
        text - the text to search in

    returns:
        list of ids, in the order the objects were iterated
    """
    result = []

    for m in modelName.objects.all():
        name = m.name
        if not name:
            # '' is a substring of every text and None cannot be searched
            # for at all, so an object without a name never matches
            continue

        if name in text:
            result.append(m.id)
            continue

        matcher = difflib.SequenceMatcher(None, name, text)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so the expensive call only runs when it can succeed
        if (matcher.real_quick_ratio() >= 0.6
                and matcher.quick_ratio() >= 0.6
                and matcher.ratio() >= 0.6):
            logger.warning('No exact match found. Performing fuzzy matching!')
            result.append(m.id)

    return result
#from json helper function
def verify(o, i, mks, committees):
- if i[0].title == o['title'] and i[0].publication_date == o['date'] and i[0].author_names == 0['author']:
+ if i[0].title == o['title'] and i[0].publication_date == o['date'] and i[0].author_names == o['author']:
if i[0].req_mks == mks or i[0].req_committees == committees:
logger.info("%s already exists in db" % o['url'])
return True
@@ -53,16 +43,13 @@ def verify(o, i, mks, committees):
class DocumentManager(models.Manager):
def from_json(self, j):
- """Read a json j, and create Document instances based on it"""
- # info from m.m.m site
- info = parse_json(j)
-
+
# checking if the db already has document o instance and if no, creating one
- for o in info:
+ for o in j:
i = self.filter(url=o['url'])
- mks = text_lookup(Member, o['candidates'])
- committees = text_lookup(Committee, o['candidates'])
+ mks = text_lookup(Member, o['heading'])
+ committees = text_lookup(Committee, o['heading'])
# db verification
if i.exists():
View
31 src/knesset/mmm/test_matches.json
@@ -0,0 +1,31 @@
+[
+ {
+ "docid": "m02254",
+ "title": "\u05ea\u05d9\u05e2\u05d5\u05d3 \u05d7\u05d6\u05d5\u05ea\u05d9 \u05d5\u05e7\u05d5\u05dc\u05d9 \u05e9\u05dc \u05d7\u05e7\u05d9\u05e8\u05ea \u05d7\u05e9\u05d5\u05d3\u05d9\u05dd",
+ "url": "http://knesset.gov.il/mmm/data/pdf/m02254.pdf",
+ "entityName": "\u05d5\u05e2\u05d3\u05ea \u05d4\u05d7\u05d5\u05e7\u05d4 \u05d7\u05d5\u05e7 \u05d5\u05de\u05e9\u05e4\u05d8",
+ "heading": "\u05de\u05e1\u05de\u05da \u05d6\u05d4 \u05e0\u05db\u05ea\u05d1 \u05dc\u05e7\u05e8\u05d0\u05ea \u05d3\u05d9\u05d5\u05df \u05d5\u05e2\u05d3\u05ea \u05d4\u05d7\u05d5\u05e7\u05d4 \u05d7\u05d5\u05e7 \u05d5\u05de\u05e9\u05e4\u05d8 \u05e9\u05dc \u05d4\u05db\u05e0\u05e1\u05ea \u05d1\u05e1\u05e2\u05d9\u05e3 23 \u05dc\u05d4\u05e6\u05e2\u05ea \u05d7\u05d5\u05e7 \u05d4\u05d4\u05ea\u05d9\u05d9\u05e2\u05dc\u05d5\u05ea \u05d4\u05db\u05dc\u05db\u05dc\u05d9\u05ea \u05ea\u05d9\u05e7\u05d5\u05e0\u05d9 \u05d7\u05e7\u05d9\u05e7\u05d4 \u05dc\u05d9\u05d9\u05e9\u05d5\u05dd \u05d4\u05ea\u05d5\u05db\u05e0\u05d9\u05ea \u05d4\u05db\u05dc\u05db\u05dc\u05d9\u05ea \u05dc\u05e9\u05e0\u05d9\u05dd 9002 \u05d5 0102 \u05d4\u05ea\u05e9\u05e1\u05d8 90021 \u05dc\u05d4\u05dc\u05df \u05d4\u05e6\u05e2\u05ea \u05d4\u05d7\u05d5\u05e7",
+ "score": 100,
+ "date": "2/7/2009",
+ "id": 10005
+ },
+ {
+ "docid": "m01310",
+ "title": "\u05ea\u05d5\u05db\u05e0\u05d9\u05d5\u05ea \u05e1\u05d9\u05d5\u05e2 \u05dc\u05de\u05d5\u201d\u05e4 \u05ea\u05e2\u05e9\u05d9\u05d9\u05ea\u05d9 ",
+ "url": "http://knesset.gov.il/mmm/data/pdf/m01310.pdf",
+ "entityName": "\u05d5\u05e2\u05d3\u05ea \u05d4\u05de\u05d3\u05e2 \u05d5\u05d4\u05d8\u05db\u05e0\u05d5\u05dc\u05d5\u05d2\u05d9\u05d4",
+ "heading": "\u05de\u05e1\u05de\u05da \u05d6\u05d4 \u05e0\u05db\u05ea\u05d1 \u05d5\u05e2\u05d3\u05ea \u05d4\u05de\u05d3\u05e2 \u05d5\u05d4\u05d8\u05db\u05e0\u05d5\u05dc\u05d5\u05d2\u05d9\u05d4 \u05dc\u05e7\u05e8\u05d0\u05ea \u05d3\u05d9\u05d5\u05df \u05d1\u05e0\u05d5\u05e9\u05d0 \u05ea\u05d5\u05db\u05e0\u05d9\u05d5\u05ea \u05d4\u05e1\u05d9\u05d5\u05e2 \u05dc\u05de\u05d5\u05e4 \u05ea\u05e2\u05e9\u05d9\u05d9\u05ea\u05d9 \u05d5\u05e4\u05e2\u05d9\u05dc\u05d5\u05ea \u05d5\u05e2\u05d3\u05ea \u05d4\u05de\u05d7\u05e7\u05e8 \u05d1\u05de\u05e1\u05de\u05da \u05de\u05d5\u05e6\u05d2\u05d9\u05dd \u05ea\u05e7\u05e6\u05d9\u05d1\u05d9 \u05d4\u05de\u05d3\u05e2\u05df \u05d4\u05e8\u05d0\u05e9\u05d9 \u05d1\u05de\u05e9\u05e8\u05d3 \u05d4\u05ea\u05e2\u05e9\u05d9\u05d9\u05d4 \u05d4\u05de\u05e1\u05d7\u05e8 \u05d5\u05d4\u05ea\u05e2\u05e1\u05d5\u05e7\u05d4 \u05e4\u05e2\u05d9\u05dc\u05d5\u05ea \u05d5\u05e2\u05d3\u05ea \u05d4\u05de\u05d7\u05e7\u05e8 \u05e9\u05dc \u05d4\u05de\u05d3\u05e2\u05df \u05d4\u05e8\u05d0\u05e9\u05d9 \u05d5\u05ea\u05d5\u05db\u05e0\u05d9\u05d5\u05ea \u05d4\u05e1\u05d9\u05d5\u05e2 \u05d4\u05e9\u05d5\u05e0\u05d5\u05ea \u05dc\u05de\u05d5\u05e4 \u05ea\u05e2\u05e9\u05d9\u05d9\u05ea\u05d9",
+ "score": 100,
+ "date": "14/11/2005",
+ "id": 10008
+ },
+ {
+ "docid": "m00028",
+ "title": "\u05d7\u05d5\u05d1\u05ea \u05d3\u05d5\u05d5\u05d7 \u05e2\u05dc \u05d0\u05dc\u05d9\u05de\u05d5\u05ea \u05d1\u05d9\u05df \u05d1\u05e0\u05d9 \u05d6\u05d5\u05d2",
+ "url": "http://knesset.gov.il/mmm/data/pdf/m00028.pdf",
+ "entityName": "\u05d5\u05e2\u05d3\u05d4 \u05dc\u05e7\u05d9\u05d3\u05d5\u05dd \u05de\u05e2\u05de\u05d3 \u05d4\u05d0\u05d9\u05e9\u05d4",
+ "heading": "\u05de\u05d5\u05d2\u05e9 \u05d5\u05e2\u05d3\u05d4 \u05dc\u05e7\u05d9\u05d3\u05d54 \u05de\u05e2\u05de\u05d3 \u05d4\u05d0\u05d9\u05e9\u05d4 ",
+ "score": 95,
+ "date": "12/3/2001",
+ "id": 10011
+ }]
View
22 src/knesset/mmm/test_mmm.json
@@ -0,0 +1,22 @@
+[{
+ "url": "http://knesset.gov.il/mmm/data/pdf/m02254.pdf",
+ "date": "2/7/2009",
+ "authors": [
+ "\u05d3\u05d9\u05e0\u05d4 \u05e6\u05d3\u05d5\u05e7"
+ ],
+ "title": "\u05ea\u05d9\u05e2\u05d5\u05d3 \u05d7\u05d6\u05d5\u05ea\u05d9 \u05d5\u05e7\u05d5\u05dc\u05d9 \u05e9\u05dc \u05d7\u05e7\u05d9\u05e8\u05ea \u05d7\u05e9\u05d5\u05d3\u05d9\u05dd"
+ }, {
+ "url": "http://knesset.gov.il/mmm/data/pdf/m01310.pdf",
+ "date": "14/11/2005",
+ "authors": [
+ "\u05d0\u05d5\u05e8\u05dc\u05d9 \u05dc\u05d5\u05d8\u05df"
+ ],
+ "title": "\u05ea\u05d5\u05db\u05e0\u05d9\u05d5\u05ea \u05e1\u05d9\u05d5\u05e2 \u05dc\u05de\u05d5\u201d\u05e4 \u05ea\u05e2\u05e9\u05d9\u05d9\u05ea\u05d9 "
+ }, {
+ "url": "http://knesset.gov.il/mmm/data/pdf/m00028.pdf",
+ "date": "12/3/2001",
+ "authors": [
+ "\u05e8\u05d7\u05dc \u05d5\u05e8\u05e6\u05d1\u05e8\u05d2\u05e8 "
+ ],
+ "title": "\u05d7\u05d5\u05d1\u05ea \u05d3\u05d5\u05d5\u05d7 \u05e2\u05dc \u05d0\u05dc\u05d9\u05de\u05d5\u05ea \u05d1\u05d9\u05df \u05d1\u05e0\u05d9 \u05d6\u05d5\u05d2"
+ }]
View
64 src/knesset/mmm/tests.py
@@ -2,14 +2,17 @@
from knesset.settings import PROJECT_ROOT
from knesset.mks.models import Member
from knesset.committees.models import Committee
-from knesset.mmm.models import Document, parse_json, text_lookup, verify
+from knesset.mmm.models import Document, text_lookup, verify
+from knesset.mmm.management.commands.update_mmm import parse_json, combine_jsons
from datetime import datetime
import simplejson
-import re
-FP = open(PROJECT_ROOT + "/mmm/test_matches.json")
-JSON = simplejson.load(FP)
-OK_CANDIDATES = re.sub(r"\s+", r" " , " ".join(JSON[0]['candidates']))
+
+matches = PROJECT_ROOT + "/mmm/test_matches.json"
+mmm = PROJECT_ROOT + "/mmm/test_mmm.json"
+JSON = simplejson.load(open(matches, 'rt'))
+MMM = simplejson.load(open(mmm, 'rt'))
+OK_CANDIDATES = JSON[0]['entityName']
OK_DATE = datetime.strptime(JSON[0]['date'], '%d/%m/%Y')
class MmmTest(TestCase):
@@ -20,49 +23,16 @@ def test_parse_json(self):
Tests data modification
"""
- j = parse_json(FP)
-
-
- self.assertEqual(OK_CANDIDATES, j[0]['candidates'])
- self.assertEqual(OK_DATE, j[0]['date'])
-
- def test_text_lookup(self):
- """
- Tests in text look up for mks and committees names
- """
- mk_name = u'\u05e8\u05d5\u05e0\u05d9\u05ea \u05ea\u05d9\u05e8\u05d5\u05e9'
- mks = Member.objects.create(name=mk_name)
- c = Committee.objects.create(name='c1')
-
-
- self.assertIn(mk_name, OK_CANDIDATES)
+ j = parse_json(open(matches).read())
- mk = text_lookup(Member, OK_CANDIDATES)
- self.assertEqual(mks.id, mk[0])
+ for o in j:
+ self.assertTrue(type(o['date'] is datetime.date))
- self.assertNotIn('c1', OK_CANDIDATES)
+ def test_combine_jsons(self):
+ json1 = open(mmm, 'rt').read()
+ json2 = open(matches, 'rt').read()
+ j = combine_jsons(json2, json1)
- committees = text_lookup(Committee, OK_CANDIDATES)
- self.assertEqual([], committees)
+ self.assertTrue(isinstance(j[0]['author'], basestring))
- def test_verify(self):
- """
- Tests verify method
- """
- o = JSON[0]
- mk_name = u'\u05e8\u05d5\u05e0\u05d9\u05ea \u05ea\u05d9\u05e8\u05d5\u05e9'
- mks = Member.objects.create(name=mk_name)
- mkses = [1]
- committees = []
- d1 = Document.objects.create(url=JSON[0]['url'], title=JSON[0]['title'],
- publication_date=OK_DATE, author_names=JSON[0]['author'])
- d1.req_mks.add(mks.id)
-
- self.assertEqual(o['url'], d1.url)
- self.assertEqual(o['title'], d1.title)
- self.assertEqual(OK_DATE, d1.publication_date)
- self.assertEqual(o['author'], d1.author_names)
- self.assertEqual(1, d1.req_mks.count())
- self.assertEqual(0, d1.req_committee.count())
-
- self.assertTrue(verify(o, [d1], mkses, committees))
+ def test_text_lookup(self):

0 comments on commit 17f3c17

Please sign in to comment.
Something went wrong with that request. Please try again.