From b78a336ab9e471b4aa9fcd94a9ccdf8a106cba61 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Fri, 25 Oct 2019 12:31:37 -0700 Subject: [PATCH 01/10] Add migration to create submit facility Waffle flag and group User accounts will need to be added to the `can_submit_facility` group in order to use the single facility submission endpoint. --- .../0040_add_submit_facility_flag.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 src/django/api/migrations/0040_add_submit_facility_flag.py diff --git a/src/django/api/migrations/0040_add_submit_facility_flag.py b/src/django/api/migrations/0040_add_submit_facility_flag.py new file mode 100644 index 000000000..6be7a5e4b --- /dev/null +++ b/src/django/api/migrations/0040_add_submit_facility_flag.py @@ -0,0 +1,26 @@ +from django.db import migrations + + +def create_can_submit_facility(apps, schema_editor): + Group = apps.get_model('auth', 'Group') + submit_facility_group = Group.objects.create(name='can_submit_facility') + + Flag = apps.get_model('waffle', 'Flag') + submit_facility_flag = Flag.objects.create( + name='can_submit_facility', + superusers=True, + staff=False, + note='Used for single facility submission API endpoint authorization') + + submit_facility_flag.groups.set([submit_facility_group]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0039_delete_facility_history_switch'), + ] + + operations = [ + migrations.RunPython(create_can_submit_facility), + ] From 6e6ac5f932545535dd981c18b19548a3818e98d8 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Fri, 25 Oct 2019 13:31:25 -0700 Subject: [PATCH 02/10] Add a create method to the FacilitiesViewSet Implementing the previously unused create action on the facilities resource is a natural fit for submitting a single facility record. This commit implements the required authentication but none of the actual logic. --- src/django/api/tests.py | 38 ++++++++++++++++++++++++++++++++++++++ src/django/api/views.py | 11 +++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 127d24397..376ae79c2 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -4695,3 +4695,41 @@ def test_in_group_receives_200(self): history_response.status_code, 200, ) + + +class FacilitySubmitTest(FacilityAPITestCaseBase): + def setUp(self): + super(FacilitySubmitTest, self).setUp() + self.url = reverse('facility-list') + + def test_unauthenticated_receives_401(self): + self.client.logout() + response = self.client.post(self.url) + self.assertEqual(response.status_code, 401) + + def test_not_in_group_receives_403(self): + self.client.logout() + self.client.login(email=self.user_email, + password=self.user_password) + + response = self.client.post(self.url) + + self.assertEqual(response.status_code, 403) + + def test_in_group_receives_200(self): + self.client.logout() + + group = auth.models.Group.objects.get( + name='can_submit_facility', + ) + + self.user.groups.set([group.id]) + self.user.save() + + self.client.login(email=self.user_email, + password=self.user_password) + + response = self.client.post(self.url) + + # TODO: Change to 200 when implemented + self.assertEqual(response.status_code, 501) diff --git a/src/django/api/views.py b/src/django/api/views.py index 262ac528a..bb192c4ac 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -516,6 +516,7 @@ def get_link(self, path, method, base_url): class FacilitiesViewSet(mixins.ListModelMixin, mixins.RetrieveModelMixin, mixins.DestroyModelMixin, + mixins.CreateModelMixin, viewsets.GenericViewSet): """ Get facilities in GeoJSON format. @@ -623,6 +624,16 @@ def retrieve(self, request, pk=None): except Facility.DoesNotExist: raise NotFound() + @transaction.atomic + def create(self, request): + # Adding the @permission_classes decorator was not working so we + # explicitly invoke our custom permission class. + if not IsRegisteredAndConfirmed().has_permission(request, self): + return Response(status=status.HTTP_401_UNAUTHORIZED) + if not flag_is_active(request._request, 'can_submit_facility'): + raise PermissionDenied() + return Response(status=status.HTTP_501_NOT_IMPLEMENTED) + @transaction.atomic def destroy(self, request, pk=None): if request.user.is_anonymous: From c0a4d04a298b09c72ac0a3f07ba7a4e27c232a65 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Fri, 25 Oct 2019 14:31:06 -0700 Subject: [PATCH 03/10] Add a serializer to validate facility create request body The single facility create endpoint requires POSTing a JSON body with `name`, `address`, and `country` fields. We reuse the existing validation function from list processing to ensure that the submitted `country` can be converted to a country code. --- src/django/api/serializers.py | 13 +++++ src/django/api/tests.py | 100 ++++++++++++++++++++++++++++++---- src/django/api/views.py | 5 ++ 3 files changed, 107 insertions(+), 11 deletions(-) diff --git a/src/django/api/serializers.py b/src/django/api/serializers.py index bf393aba0..166564683 100644 --- a/src/django/api/serializers.py +++ b/src/django/api/serializers.py @@ -33,6 +33,7 @@ ProductType, ProductionType) from api.countries import COUNTRY_NAMES, COUNTRY_CHOICES +from api.processing import get_country_code from waffle import switch_is_active @@ -538,6 +539,18 @@ def get_claim_info(self, facility): return None +class FacilityCreateBodySerializer(Serializer): + country = CharField(required=True) + name = CharField(required=True, max_length=200) + address = CharField(required=True, max_length=200) + + def validate_country(self, value): + try: + return get_country_code(value) + except ValueError as ve: + raise ValidationError(ve) + + class FacilityClaimSerializer(ModelSerializer): facility_name = SerializerMethodField() oar_id = SerializerMethodField() diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 376ae79c2..4c67a51a6 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -31,7 +31,8 @@ geocode_address) from api.test_data import parsed_city_hall_data from api.permissions import referring_host_is_allowed, referring_host -from api.serializers import ApprovedFacilityClaimSerializer +from api.serializers import (ApprovedFacilityClaimSerializer, + FacilityCreateBodySerializer) class FacilityListCreateTest(APITestCase): @@ -4701,6 +4702,21 @@ class FacilitySubmitTest(FacilityAPITestCaseBase): def setUp(self): super(FacilitySubmitTest, self).setUp() self.url = reverse('facility-list') + self.valid_facility = { + 'country': 'United States', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + } + + def join_group_and_login(self): + self.client.logout() + group = auth.models.Group.objects.get( + name='can_submit_facility', + ) + self.user.groups.set([group.id]) + self.user.save() + self.client.login(email=self.user_email, + password=self.user_password) def test_unauthenticated_receives_401(self): self.client.logout() @@ -4716,20 +4732,82 @@ def test_not_in_group_receives_403(self): self.assertEqual(response.status_code, 403) - def test_in_group_receives_200(self): - self.client.logout() + def test_empty_body_is_invalid(self): + self.join_group_and_login() + response = self.client.post(self.url) + self.assertEqual(response.status_code, 400) - group = auth.models.Group.objects.get( - name='can_submit_facility', - ) + def test_missing_fields_are_invalid(self): + self.join_group_and_login() - self.user.groups.set([group.id]) - self.user.save() + response = self.client.post(self.url, { + 'country': 'US', + 'name': 'Something', + }) + self.assertEqual(response.status_code, 400) - self.client.login(email=self.user_email, - password=self.user_password) + response = self.client.post(self.url, { + 'country': 'US', + 'address': 'Some street', + }) + self.assertEqual(response.status_code, 400) - response = self.client.post(self.url) + response = self.client.post(self.url, { + 'name': 'Something', + 'address': 'Some street', + }) + self.assertEqual(response.status_code, 400) + + def test_valid_request(self): + self.join_group_and_login() + response = self.client.post(self.url, self.valid_facility) # TODO: Change to 200 when implemented self.assertEqual(response.status_code, 501) + + +class FacilityCreateBodySerializerTest(TestCase): + def test_valid_data(self): + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertTrue(serializer.is_valid()) + + def test_missing_fields(self): + serializer = FacilityCreateBodySerializer(data={ + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('country', serializer.errors) + self.assertNotIn('name', serializer.errors) + self.assertNotIn('address', serializer.errors) + + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('name', serializer.errors) + self.assertNotIn('country', serializer.errors) + self.assertNotIn('address', serializer.errors) + + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'name': 'Pants Hut', + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('address', serializer.errors) + self.assertNotIn('country', serializer.errors) + self.assertNotIn('name', serializer.errors) + + def test_invalid_country(self): + serializer = FacilityCreateBodySerializer(data={ + 'country': 'Notrealia', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('country', serializer.errors) diff --git a/src/django/api/views.py b/src/django/api/views.py index bb192c4ac..f14155149 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -75,6 +75,7 @@ FacilityListQueryParamsSerializer, FacilitySerializer, FacilityDetailsSerializer, + FacilityCreateBodySerializer, UserSerializer, UserProfileSerializer, FacilityClaimSerializer, @@ -632,6 +633,10 @@ def create(self, request): return Response(status=status.HTTP_401_UNAUTHORIZED) if not flag_is_active(request._request, 'can_submit_facility'): raise PermissionDenied() + + body_serializer = FacilityCreateBodySerializer(data=request.data) + body_serializer.is_valid(raise_exception=True) + return Response(status=status.HTTP_501_NOT_IMPLEMENTED) @transaction.atomic From a007352db318f89c07c17c95b274aad0af11b267 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Fri, 25 Oct 2019 15:53:29 -0700 Subject: [PATCH 04/10] Add query param serializer for facility create endpoint Handles parsing and setting the default values for the `create` and `public` boolean query string arguments. --- src/django/api/serializers.py | 6 ++++++ src/django/api/tests.py | 8 ++++++++ src/django/api/views.py | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/src/django/api/serializers.py b/src/django/api/serializers.py index 166564683..408cedca6 100644 --- a/src/django/api/serializers.py +++ b/src/django/api/serializers.py @@ -12,6 +12,7 @@ EmailField, IntegerField, ListField, + BooleanField, ModelSerializer, SerializerMethodField, ValidationError, @@ -551,6 +552,11 @@ def validate_country(self, value): raise ValidationError(ve) +class FacilityCreateQueryParamsSerializer(Serializer): + create = BooleanField(default=True, required=False) + public = BooleanField(default=True, required=False) + + class FacilityClaimSerializer(ModelSerializer): facility_name = SerializerMethodField() oar_id = SerializerMethodField() diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 4c67a51a6..9cdf2f402 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -4765,6 +4765,14 @@ def test_valid_request(self): # TODO: Change to 200 when implemented self.assertEqual(response.status_code, 501) + def test_valid_request_with_params(self): + self.join_group_and_login() + url_with_query = '{}?create=false&public=false'.format(self.url) + response = self.client.post(url_with_query, self.valid_facility) + + # TODO: Change to 200 when implemented + self.assertEqual(response.status_code, 501) + class FacilityCreateBodySerializerTest(TestCase): def test_valid_data(self): diff --git a/src/django/api/views.py b/src/django/api/views.py index f14155149..0c58fbe0b 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -76,6 +76,7 @@ FacilitySerializer, FacilityDetailsSerializer, FacilityCreateBodySerializer, + FacilityCreateQueryParamsSerializer, UserSerializer, UserProfileSerializer, FacilityClaimSerializer, @@ -637,6 +638,10 @@ def create(self, request): body_serializer = FacilityCreateBodySerializer(data=request.data) body_serializer.is_valid(raise_exception=True) + params_serializer = FacilityCreateQueryParamsSerializer( + data=request.query_params) + params_serializer.is_valid(raise_exception=True) + return Response(status=status.HTTP_501_NOT_IMPLEMENTED) @transaction.atomic From cb0e8a89801b4c97768ebbaa9ae40f5220d25081 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Fri, 25 Oct 2019 16:31:48 -0700 Subject: [PATCH 05/10] Add constants for feature groups and facility create query params It's a best practice to use string constants rather than literals when referring to a string value in multiple places. It is safe to edit the migration since we are not changing any actual logic, just extracting the string constant. --- src/django/api/constants.py | 11 +++++++++++ .../migrations/0038_add_facility_history_flag.py | 14 +++++++++----- src/django/api/tests.py | 7 ++++--- src/django/api/views.py | 10 +++++++--- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/django/api/constants.py b/src/django/api/constants.py index 8c139b93c..6ca2cdadc 100644 --- a/src/django/api/constants.py +++ b/src/django/api/constants.py @@ -40,6 +40,17 @@ class FacilityMergeQueryParams: MERGE = 'merge' +class FacilityCreateQueryParams: + CREATE = 'create' + PUBLIC = 'public' + + +class FeatureGroups: + CAN_GET_FACILITY_HISTORY = 'can_get_facility_history' + CAN_SUBMIT_FACILITY = 'can_submit_facility' + CAN_SUBMIT_PRIVATE_FACILITY = 'can_submit_private_facility' + + class FacilityHistoryActions: CREATE = 'CREATE' UPDATE = 'UPDATE' diff --git a/src/django/api/migrations/0038_add_facility_history_flag.py b/src/django/api/migrations/0038_add_facility_history_flag.py index d3acf5c7e..9b04b85d7 100644 --- a/src/django/api/migrations/0038_add_facility_history_flag.py +++ b/src/django/api/migrations/0038_add_facility_history_flag.py @@ -1,15 +1,19 @@ from django.db import migrations +from api.constants import FeatureGroups + def create_can_get_facility_history(apps, schema_editor): Group = apps.get_model('auth', 'Group') - history_group = Group.objects.create(name='can_get_facility_history') + history_group = Group.objects.create( + name=FeatureGroups.CAN_GET_FACILITY_HISTORY) Flag = apps.get_model('waffle', 'Flag') - history_flag = Flag.objects.create(name='can_get_facility_history', - superusers=True, - staff=False, - note='Used for history API endpoint authorization') + history_flag = Flag.objects.create( + name=FeatureGroups.CAN_GET_FACILITY_HISTORY, + superusers=True, + staff=False, + note='Used for history API endpoint authorization') history_flag.groups.set([history_group]) diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 9cdf2f402..4f0e1efba 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -17,7 +17,8 @@ from api.constants import (ProcessingAction, LogDownloadQueryParams, - UpdateLocationParams) + UpdateLocationParams, + FeatureGroups) from api.models import (Facility, FacilityList, FacilityListItem, FacilityClaim, FacilityClaimReviewNote, FacilityMatch, FacilityAlias, Contributor, User, @@ -3954,7 +3955,7 @@ def setUp(self): self.facility_two.id ) - @override_flag('can_get_facility_history', active=True) + @override_flag(FeatureGroups.CAN_GET_FACILITY_HISTORY, active=True) def test_serializes_deleted_facility_history(self): delete_facility_url = '/api/facilities/{}/'.format(self.facility.id) delete_response = self.client.delete(delete_facility_url) @@ -4711,7 +4712,7 @@ def setUp(self): def join_group_and_login(self): self.client.logout() group = auth.models.Group.objects.get( - name='can_submit_facility', + name=FeatureGroups.CAN_SUBMIT_FACILITY, ) self.user.groups.set([group.id]) self.user.save() diff --git a/src/django/api/views.py b/src/django/api/views.py index 0c58fbe0b..bb4250047 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -51,9 +51,11 @@ FacilityListQueryParams, FacilityListItemsQueryParams, FacilityMergeQueryParams, + FacilityCreateQueryParams, ProcessingAction, LogDownloadQueryParams, - UpdateLocationParams) + UpdateLocationParams, + FeatureGroups) from api.models import (FacilityList, FacilityListItem, FacilityClaim, @@ -632,7 +634,8 @@ def create(self, request): # explicitly invoke our custom permission class. if not IsRegisteredAndConfirmed().has_permission(request, self): return Response(status=status.HTTP_401_UNAUTHORIZED) - if not flag_is_active(request._request, 'can_submit_facility'): + if not flag_is_active(request._request, + FeatureGroups.CAN_SUBMIT_FACILITY): raise PermissionDenied() body_serializer = FacilityCreateBodySerializer(data=request.data) @@ -1275,7 +1278,8 @@ def get_facility_history(self, request, pk=None): } ] """ - if not flag_is_active(request._request, 'can_get_facility_history'): + if not flag_is_active(request._request, + FeatureGroups.CAN_GET_FACILITY_HISTORY): raise PermissionDenied() historical_facility_queryset = Facility.history.filter(id=pk) From 213f6393044b168638f79b05c6e0650819420e06 Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Mon, 28 Oct 2019 08:15:28 -0700 Subject: [PATCH 06/10] Add migration and authorization for private facility submission User accounts must belong to the `can_submit_private_facility` group in order to submit facilities to be displayed without a specific contributor affiliation. --- .../0041_add_private_facility_flag.py | 29 +++++++++++++++++++ src/django/api/tests.py | 18 +++++++++++- src/django/api/views.py | 6 ++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 src/django/api/migrations/0041_add_private_facility_flag.py diff --git a/src/django/api/migrations/0041_add_private_facility_flag.py b/src/django/api/migrations/0041_add_private_facility_flag.py new file mode 100644 index 000000000..6931cf6d7 --- /dev/null +++ b/src/django/api/migrations/0041_add_private_facility_flag.py @@ -0,0 +1,29 @@ +from django.db import migrations + +from api.constants import FeatureGroups + + +def create_can_submit_private_facility(apps, schema_editor): + Group = apps.get_model('auth', 'Group') + submit_private_facility_group = Group.objects.create( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY) + + Flag = apps.get_model('waffle', 'Flag') + submit_private_facility_flag = Flag.objects.create( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY, + superusers=True, + staff=False, + note='Used for private facility submission API param authorization') + + submit_private_facility_flag.groups.set([submit_private_facility_group]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0040_add_submit_facility_flag'), + ] + + operations = [ + migrations.RunPython(create_can_submit_private_facility), + ] diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 4f0e1efba..d1f64543f 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -4768,12 +4768,28 @@ def test_valid_request(self): def test_valid_request_with_params(self): self.join_group_and_login() - url_with_query = '{}?create=false&public=false'.format(self.url) + url_with_query = '{}?create=false&public=true'.format(self.url) response = self.client.post(url_with_query, self.valid_facility) # TODO: Change to 200 when implemented self.assertEqual(response.status_code, 501) + def test_private_permission(self): + self.join_group_and_login() + url_with_query = '{}?public=false'.format(self.url) + response = self.client.post(url_with_query, self.valid_facility) + self.assertEqual(response.status_code, 403) + + group = auth.models.Group.objects.get( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY, + ) + self.user.groups.add(group.id) + self.user.save() + + response = self.client.post(url_with_query, self.valid_facility) + # TODO: Change to 200 when implemented + self.assertEqual(response.status_code, 501) + class FacilityCreateBodySerializerTest(TestCase): def test_valid_data(self): diff --git a/src/django/api/views.py b/src/django/api/views.py index bb4250047..495216a49 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -644,6 +644,12 @@ def create(self, request): params_serializer = FacilityCreateQueryParamsSerializer( data=request.query_params) params_serializer.is_valid(raise_exception=True) + public_submission = params_serializer.validated_data[ + FacilityCreateQueryParams.PUBLIC] + private_allowed = flag_is_active( + request._request, FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY) + if not public_submission and not private_allowed: + raise PermissionDenied('Cannot submit a private facility') return Response(status=status.HTTP_501_NOT_IMPLEMENTED) From 4796c11e3095b83fb925b92f185b91705881eecf Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Tue, 29 Oct 2019 12:50:19 -0700 Subject: [PATCH 07/10] Extract match methods to new module / refactor for fast single match In order to support quickly matching individual list items we needed a way of holding a trained and indexed model in memory and keeping it updated as the facility data changed. The new `GazetteerCache` class implements this by holding a trained model in a class variable and reading from the `HistoricalFacility` table to determine whether items need to be added or removed from the index. The `GazetteerCache` includes a threading lock to ensure that only one thread can ever be updating the cached gazetteer. Although the management command-based matching uses the new `GazetteerCache` the actual behavior has not changed, since starting a process from scratch will result in training a new model, as we have always done. As part of this refactor we have also addressed a problem with single item CSV uploads causing the match process run out of memory. We address this by changing the way models are trained. Instead of using the input file, we use 20% of the submitted list items as our "dirty" data. --- .../api/management/commands/batch_process.py | 2 +- src/django/api/matching.py | 421 ++++++++++++++++++ src/django/api/processing.py | 159 +------ src/django/api/tests.py | 6 +- 4 files changed, 426 insertions(+), 162 deletions(-) create mode 100644 src/django/api/matching.py diff --git a/src/django/api/management/commands/batch_process.py b/src/django/api/management/commands/batch_process.py index 2cc4be5ab..3213543c3 100644 --- a/src/django/api/management/commands/batch_process.py +++ b/src/django/api/management/commands/batch_process.py @@ -6,9 +6,9 @@ from api.constants import ProcessingAction from api.models import FacilityList, FacilityListItem +from api.matching import match_facility_list_items from api.processing import (parse_facility_list_item, geocode_facility_list_item, - match_facility_list_items, save_match_details) LINE_ITEM_ACTIONS = { diff --git a/src/django/api/matching.py b/src/django/api/matching.py new file mode 100644 index 000000000..f9df7db15 --- /dev/null +++ b/src/django/api/matching.py @@ -0,0 +1,421 @@ +import dedupe +import logging +import os +import re +import threading + +from collections import defaultdict +from datetime import datetime +from django.conf import settings +from django.db import transaction +from django.db.models import Q, Max +from unidecode import unidecode + +from api.models import (Facility, + FacilityList, + FacilityListItem, + HistoricalFacility) + +logger = logging.getLogger(__name__) + + +def clean(column): + """ + Remove punctuation and excess whitespace from a value before using it to + find matches. This should be the same function used when developing the + training data read from training.json as part of train_gazetteer. + """ + column = unidecode(column) + column = re.sub('\n', ' ', column) + column = re.sub('-', '', column) + column = re.sub('/', ' ', column) + column = re.sub("'", '', column) + column = re.sub(",", '', column) + column = re.sub(":", ' ', column) + column = re.sub(' +', ' ', column) + column = column.strip().strip('"').strip("'").lower().strip() + if not column: + column = None + return column + + +def get_canonical_items(): + """ + Fetch all `Facility` items and create a dictionary suitable for use by a + Dedupe model. + + Returns: + A dictionary. The key is the `Facility` OAR ID. The value is a dictionary + of clean field values keyed by field name (country, name, address). A + "clean" value is one which has been passed through the `clean` function. + """ + facility_set = Facility.objects.all().extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in facility_set} + + +def get_messy_items_from_facility_list(facility_list): + """ + Fetch all `FacilityListItem` objects that belong to the specified + `FacilityList` and create a dictionary suitable for use by a Dedupe model. + + Arguments: + facility_list -- A `FacilityList`. + + Returns: + A dictionary. The key is the `FacilityListItem` ID. The value is a + dictionary of clean field values keyed by field name (country, name, + address). A "clean" value is one which has been passed through the `clean` + function. + """ + facility_list_item_set = facility_list.source.facilitylistitem_set.filter( + Q(status=FacilityListItem.GEOCODED) + | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)).extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in facility_list_item_set} + + +def get_messy_items_for_training(mod_factor=5): + """ + Fetch a subset of `FacilityListItem` objects that have been parsed and are + not in an error state. + + Arguments: + mod_factor -- Used to partition a subset of `FacilityListItem` records. The + larger the value, the fewer records will be contained in the + subset. + + Returns: + A dictionary. The key is the `FacilityListItem` ID. The value is a + dictionary of clean field values keyed by field name (country, name, + address). A "clean" value is one which has been passed through the `clean` + function. + """ + facility_list_item_set = FacilityListItem.objects.exclude( + Q(status=FacilityListItem.UPLOADED) + | Q(status=FacilityListItem.ERROR) + | Q(status=FacilityListItem.ERROR_PARSING) + | Q(status=FacilityListItem.ERROR_GEOCODING) + | Q(status=FacilityListItem.ERROR_MATCHING) + ).extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + records = [record for (i, record) in enumerate(facility_list_item_set) + if i % mod_factor == 0] + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in records} + + +def train_gazetteer(messy, canonical, model_settings=None, should_index=False): + """ + Train and return a dedupe.Gazetteer using the specified messy and canonical + dictionaries. The messy and canonical objects should have the same + structure: + - The key is a unique ID + - The value is another dictionary of field:value pairs. This dictionary + must contain at least 'country', 'name', and 'address' keys. + + Reads a training.json file containing positive and negative matches. + """ + if model_settings: + gazetteer = dedupe.StaticGazetteer(model_settings) + else: + fields = [ + {'field': 'country', 'type': 'Exact'}, + {'field': 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String'}, + ] + + gazetteer = dedupe.Gazetteer(fields) + gazetteer.sample(messy, canonical, 15000) + training_file = os.path.join(settings.BASE_DIR, 'api', 'data', + 'training.json') + with open(training_file) as tf: + gazetteer.readTraining(tf) + gazetteer.train() + gazetteer.cleanupTraining() + + if should_index: + index_start = datetime.now() + logger.info('Indexing started') + gazetteer.index(canonical) + index_duration = datetime.now() - index_start + logger.info('Indexing finished ({})'.format(index_duration)) + logger.info('Cleanup training') + + return gazetteer + + +class MatchDefaults: + AUTOMATIC_THRESHOLD = 0.8 + GAZETTEER_THRESHOLD = 0.5 + RECALL_WEIGHT = 1.0 + + +class NoCanonicalRecordsError(Exception): + pass + + +def match_items(messy, + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Attempt to match each of the "messy" items specified with a "canonical" + item. + + This function reads from but does not update the database. + + When an argument description mentions a "clean" value it is referring to a + value that has been passed through the `clean` function. + + Arguments: + messy -- A dictionary. The key is the unique identifier of each item to be + matched. The value is a dictionary of clean field values keyed by + field name (country, name, address). + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. + https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + An dict containing the results of the matching process and contains the + following keys: + + processed_list_item_ids -- A list of all the keys in `messy` that were + considered for matching. + item_matches -- A dictionary where the keys are `messy` keys and the values + are lists of tuples where the first element is a key from + `canonical` representing an item that is a potential match + and the second element is the confidence score of the + match. + results -- A dictionary containing additional information about the + matching process that pertains to all the `messy` items and + contains the following keys: + gazetteer_threshold -- The threshold computed from the trained model + automatic_threshold -- The value of the automatic_threshold parameter + returned for convenience + recall_weight -- The value of the recall_weight parameter returned for + convenience. + code_version -- The value of the GIT_COMMIT setting. + started -- The date and time at which the training and matching was + started. + finished -- The date and time at which the training and matching was + finished. + """ + started = str(datetime.utcnow()) + if len(messy.keys()) > 0: + no_geocoded_items = False + try: + gazetteer = GazetteerCache.get_latest() + gazetteer.threshold(messy, recall_weight=recall_weight) + results = gazetteer.match(messy, threshold=gazetteer_threshold, + n_matches=None, generator=True) + no_gazetteer_matches = False + except NoCanonicalRecordsError: + results = [] + no_gazetteer_matches = True + except dedupe.core.BlockingError: + results = [] + no_gazetteer_matches = True + else: + results = [] + no_gazetteer_matches = Facility.objects.count() == 0 + no_geocoded_items = len(messy.keys()) == 0 + + finished = str(datetime.utcnow()) + + item_matches = defaultdict(list) + for matches in results: + for (messy_id, canon_id), score in matches: + item_matches[messy_id].append((canon_id, score)) + + return { + 'processed_list_item_ids': list(messy.keys()), + 'item_matches': item_matches, + 'results': { + 'no_gazetteer_matches': no_gazetteer_matches, + 'no_geocoded_items': no_geocoded_items, + 'gazetteer_threshold': gazetteer_threshold, + 'automatic_threshold': automatic_threshold, + 'recall_weight': recall_weight, + 'code_version': settings.GIT_COMMIT + }, + 'started': started, + 'finished': finished + } + + +def match_facility_list_items( + facility_list, + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Fetch items from the specified `FacilityList` and match them to the current + list of facilities. + + Arguments: + facility_list -- A FacilityList instance + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. + https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + See `match_items`. + + """ + if type(facility_list) != FacilityList: + raise ValueError('Argument must be a FacilityList') + + return match_items(get_messy_items_from_facility_list(facility_list), + automatic_threshold=automatic_threshold, + gazetteer_threshold=gazetteer_threshold, + recall_weight=recall_weight) + + +def match_item(country, + name, + address, + id='id', + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Match the details of a single facility to the list of existing facilities. + + Arguments: + country -- A valid country name or 2-character ISO code. + name -- The name of the facility. + address -- The address of the facility. + id -- The key value in the returned match results. + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. + https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + See `match_items`. + """ + return match_items( + { + str(id): { + "country": clean(country), + "name": clean(name), + "address": clean(address) + } + }, + automatic_threshold=automatic_threshold, + gazetteer_threshold=gazetteer_threshold, + recall_weight=recall_weight) + + +def facility_to_dedupe_record(facility): + """ + Convert a `Facility` into a dictionary suitable for training and indexing a + Dedupe model. + + Arguments: + facility -- A `Facility` object. + + Returns: + A dictionary with the `Facility` id as the key and a dictionary of fields + as the value. + """ + return { + str(facility.id): { + "country": clean(facility.country_code), + "name": clean(facility.name), + "address": clean(facility.address), + } + } + + +class GazetteerCache: + """ + A container for holding a single, trained and indexed Gazetteer in memory, + which is updated with any `Facility` rows that have been added, updated, or + removed since the previous call to the `get_latest` class method. + + Note that the first time `get_latest` is called it will be slow, as it + needs to train a model and index it with all the `Facility` items. + """ + _lock = threading.Lock() + _gazetter = None + _version = None + + @classmethod + @transaction.atomic + def get_latest(cls): + with cls._lock: + db_version = HistoricalFacility.objects.aggregate( + max_id=Max('history_id')).get('max_id') + if cls._gazetter is None: + canonical = get_canonical_items() + if len(canonical.keys()) == 0: + raise NoCanonicalRecordsError() + cls._gazetter = train_gazetteer(get_messy_items_for_training(), + canonical, + should_index=True) + cls._version = db_version + if db_version != cls._version: + if cls._version is None: + last_version_id = 0 + else: + last_version_id = cls._version + changes = HistoricalFacility \ + .objects \ + .filter(history_id__gt=last_version_id) \ + .order_by('history_id') \ + .extra(select={'country': 'country_code'}) \ + .values('id', 'country', 'name', 'address', + 'history_type') + for item in changes: + if item['history_type'] == '-': + cls._gazetter.unindex({ + item['id']: { + 'country': clean(item['country']), + 'name': clean(item['name']), + 'address': clean(item['address']), + } + }) + else: + # The history record has old field values, so we + # fetch the latest version + try: + facility = Facility.objects.get(id=item['id']) + cls._gazetter.index( + facility_to_dedupe_record(facility)) + except Facility.DoesNotExist: + # If the facility no longer exists, just skip + # indexing. + pass + cls._version = db_version + + return cls._gazetter diff --git a/src/django/api/processing.py b/src/django/api/processing.py index 5cab622de..7a4da9307 100644 --- a/src/django/api/processing.py +++ b/src/django/api/processing.py @@ -1,23 +1,17 @@ import csv -import os import traceback -import re import sys -import dedupe import xlrd -from collections import defaultdict from datetime import datetime -from unidecode import unidecode from django.conf import settings from django.contrib.gis.geos import Point from django.core.exceptions import ValidationError -from django.db.models import Q from api.constants import CsvHeaderField, ProcessingAction -from api.models import Facility, FacilityMatch, FacilityList, FacilityListItem +from api.models import Facility, FacilityMatch, FacilityListItem from api.countries import COUNTRY_CODES, COUNTRY_NAMES from api.geocoding import geocode_address @@ -217,157 +211,6 @@ def geocode_facility_list_item(item): }) -def clean(column): - """ - Remove punctuation and excess whitespace from a value before using it to - find matches. This should be the same function used when developing the - training data read from training.json as part of train_gazetteer. - """ - column = unidecode(column) - column = re.sub('\n', ' ', column) - column = re.sub('-', '', column) - column = re.sub('/', ' ', column) - column = re.sub("'", '', column) - column = re.sub(",", '', column) - column = re.sub(":", ' ', column) - column = re.sub(' +', ' ', column) - column = column.strip().strip('"').strip("'").lower().strip() - if not column: - column = None - return column - - -def train_gazetteer(messy, canonical): - """ - Train and return a dedupe.Gazetteer using the specified messy and canonical - dictionaries. The messy and canonical objects should have the same - structure: - - The key is a unique ID - - The value is another dictionary of field:value pairs. This dictionary - must contain at least 'country', 'name', and 'address' keys. - - Reads a training.json file containing positive and negative matches. - """ - fields = [ - {'field': 'country', 'type': 'Exact'}, - {'field': 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String'}, - ] - - gazetteer = dedupe.Gazetteer(fields) - gazetteer.sample(messy, canonical, 15000) - training_file = os.path.join(settings.BASE_DIR, 'api', 'data', - 'training.json') - with open(training_file) as tf: - gazetteer.readTraining(tf) - gazetteer.train() - gazetteer.index(canonical) - gazetteer.cleanupTraining() - # The gazetteer example in the dedupeio/dedupe-examples repository called - # index both after training and after calling cleanupTraining. - gazetteer.index(canonical) - return gazetteer - - -def match_facility_list_items(facility_list, automatic_threshold=0.8, - gazetteer_threshold=0.5, recall_weight=1.0): - """ - Attempt to match all the items in the specified FacilityList that are in - the GEOCODED status with existing facilities. - - This function reads from but does not update the database. - - Arguments: - facility_list -- A FacilityList instance - automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence - score greater than this value will be assigned - automatically. - recall_weight -- Sets the tradeoff between precision and recall. A value of - 1.0 give an equal weight to precision and recall. - https://en.wikipedia.org/wiki/Precision_and_recall - https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html - - Returns: - An dict containing the results of the matching process and contains the - following keys: - - processed_list_item_ids -- A list of all the FacilityListItem IDs that were - considered for matching. - item_matches -- A dictionary where the keys are FacilityListItem IDs and - the values are lists of tuples where the first element is - the ID of a Facility that is a potential match and the - second element is the confidence score of the match. - results -- A dictionary containing additional information about the - matching process that pertains to all the list items and - contains the following keys: - gazetteer_threshold -- The threshold computed from the trained model - automatic_threshold -- The value of the automatic_threshold parameter - returned for convenience - recall_weight -- The value of the recall_weight parameter returned for - convenience. - code_version -- The value of the GIT_COMMIT setting. - started -- The date and time at which the training and matching was - started. - finished -- The date and time at which the training and matching was - finished. - """ - started = str(datetime.utcnow()) - if type(facility_list) != FacilityList: - raise ValueError('Argument must be a FacilityList') - - facility_set = Facility.objects.all().extra( - select={'country': 'country_code'}).values( - 'id', 'country', 'name', 'address') - canonical = {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} - for i in facility_set} - - facility_list_item_set = facility_list.source.facilitylistitem_set.filter( - Q(status=FacilityListItem.GEOCODED) - | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)).extra( - select={'country': 'country_code'}).values( - 'id', 'country', 'name', 'address') - messy = {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} - for i in facility_list_item_set} - - if len(canonical.keys()) > 0 and len(messy.keys()) > 0: - no_geocoded_items = False - gazetteer = train_gazetteer(messy, canonical) - try: - gazetteer.threshold(messy, recall_weight=recall_weight) - results = gazetteer.match(messy, threshold=gazetteer_threshold, - n_matches=None, generator=True) - no_gazetteer_matches = False - except dedupe.core.BlockingError: - results = [] - no_gazetteer_matches = True - else: - results = [] - no_gazetteer_matches = len(canonical.keys()) == 0 - no_geocoded_items = len(messy.keys()) == 0 - - finished = str(datetime.utcnow()) - - item_matches = defaultdict(list) - for matches in results: - for (messy_id, canon_id), score in matches: - item_matches[messy_id].append((canon_id, score)) - - return { - 'processed_list_item_ids': list(messy.keys()), - 'item_matches': item_matches, - 'results': { - 'no_gazetteer_matches': no_gazetteer_matches, - 'no_geocoded_items': no_geocoded_items, - 'gazetteer_threshold': gazetteer_threshold, - 'automatic_threshold': automatic_threshold, - 'recall_weight': recall_weight, - 'code_version': settings.GIT_COMMIT - }, - 'started': started, - 'finished': finished - } - - def save_match_details(match_results): """ Save the results of a call to match_facility_list_items by creating diff --git a/src/django/api/tests.py b/src/django/api/tests.py index d1f64543f..4f7253eb7 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -24,9 +24,9 @@ FacilityMatch, FacilityAlias, Contributor, User, RequestLog, DownloadLog, FacilityLocation, Source) from api.oar_id import make_oar_id, validate_oar_id +from api.matching import match_facility_list_items from api.processing import (parse_facility_list_item, - geocode_facility_list_item, - match_facility_list_items) + geocode_facility_list_item) from api.geocoding import (create_geocoding_params, format_geocoded_address_data, geocode_address) @@ -1131,7 +1131,7 @@ def interspace(string): def junk_chars(string): - return string + 'YY' + return 'AA' + string + 'YY' class DedupeMatchingTests(TestCase): From 44cd9c23e4dea8757bdcab79d781afcc97095adb Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Tue, 29 Oct 2019 16:12:04 -0700 Subject: [PATCH 08/10] Implement single-item matching This commit adds the matching and persistence logic to the view function. The flow of the view is based on the batch processing pipeline, but all three steps are completed at once. Each stage is wrapped in an individual try block so we can log unexpected failures similar to the batch process. The "parse" step is combined with the saving of the initial `Source` and `FacilityListItem` records, since the data fields are submitted "pre-parsed." We create `Source` and `FacilityListItem` objects even when the user passes the `create=false` option because we still want to track the submission of the data in the `Source` table and the performance of the matching in the `FacilityListItem.processing_results` field. We were able to reuse the `save_match_details` function used by the batch processing. The only change we needed to make was to wrap the model save code with a check of the `Source.create` property. We needed to adjust the facility delete method because submitting high-confidence matches with `create=false` was creating `FacilityListItem` rows with foreign key references to the `Facility`, but not a corresponding `FacilityMatch` row. --- src/django/api/processing.py | 32 +++--- src/django/api/tests.py | 11 +-- src/django/api/views.py | 183 ++++++++++++++++++++++++++++++++--- 3 files changed, 190 insertions(+), 36 deletions(-) diff --git a/src/django/api/processing.py b/src/django/api/processing.py index 7a4da9307..c31342d84 100644 --- a/src/django/api/processing.py +++ b/src/django/api/processing.py @@ -270,8 +270,9 @@ def make_pending_match(item_id, facility_id, score): }) item.save() - for m in matches: - m.save() + if item.source.create: + for m in matches: + m.save() unmatched = (FacilityListItem.objects .filter(id__in=processed_list_item_ids) @@ -288,19 +289,20 @@ def make_pending_match(item_id, facility_id, score): 'finished_at': finished }) else: - facility = Facility(name=item.name, - address=item.address, - country_code=item.country_code, - location=item.geocoded_point, - created_from=item) - facility.save() - - match = make_pending_match(item.id, facility.id, 1.0) - match.results['match_type'] = 'no_gazetteer_match' - match.status = FacilityMatch.AUTOMATIC - match.save() - - item.facility = facility + if item.source.create: + facility = Facility(name=item.name, + address=item.address, + country_code=item.country_code, + location=item.geocoded_point, + created_from=item) + facility.save() + + match = make_pending_match(item.id, facility.id, 1.0) + match.results['match_type'] = 'no_gazetteer_match' + match.status = FacilityMatch.AUTOMATIC + match.save() + + item.facility = facility item.status = FacilityListItem.MATCHED item.processing_results.append({ 'action': ProcessingAction.MATCH, diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 4f7253eb7..450b9ea5c 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -4762,17 +4762,13 @@ def test_missing_fields_are_invalid(self): def test_valid_request(self): self.join_group_and_login() response = self.client.post(self.url, self.valid_facility) - - # TODO: Change to 200 when implemented - self.assertEqual(response.status_code, 501) + self.assertEqual(response.status_code, 201) def test_valid_request_with_params(self): self.join_group_and_login() url_with_query = '{}?create=false&public=true'.format(self.url) response = self.client.post(url_with_query, self.valid_facility) - - # TODO: Change to 200 when implemented - self.assertEqual(response.status_code, 501) + self.assertEqual(response.status_code, 200) def test_private_permission(self): self.join_group_and_login() @@ -4787,8 +4783,7 @@ def test_private_permission(self): self.user.save() response = self.client.post(url_with_query, self.valid_facility) - # TODO: Change to 200 when implemented - self.assertEqual(response.status_code, 501) + self.assertEqual(response.status_code, 201) class FacilityCreateBodySerializerTest(TestCase): diff --git a/src/django/api/views.py b/src/django/api/views.py index 495216a49..7cdcfa835 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -1,6 +1,7 @@ import operator import os import sys +import traceback from datetime import datetime from functools import reduce @@ -56,6 +57,8 @@ LogDownloadQueryParams, UpdateLocationParams, FeatureGroups) +from api.geocoding import geocode_address +from api.matching import match_item from api.models import (FacilityList, FacilityListItem, FacilityClaim, @@ -69,7 +72,11 @@ Version, FacilityLocation, Source) -from api.processing import parse_csv_line, parse_csv, parse_excel +from api.processing import (parse_csv_line, + parse_csv, + parse_excel, + get_country_code, + save_match_details) from api.serializers import (FacilityListSerializer, FacilityListItemSerializer, FacilityListItemsQueryParamsSerializer, @@ -644,6 +651,8 @@ def create(self, request): params_serializer = FacilityCreateQueryParamsSerializer( data=request.query_params) params_serializer.is_valid(raise_exception=True) + should_create = params_serializer.validated_data[ + FacilityCreateQueryParams.CREATE] public_submission = params_serializer.validated_data[ FacilityCreateQueryParams.PUBLIC] private_allowed = flag_is_active( @@ -651,7 +660,151 @@ def create(self, request): if not public_submission and not private_allowed: raise PermissionDenied('Cannot submit a private facility') - return Response(status=status.HTTP_501_NOT_IMPLEMENTED) + parse_started = str(datetime.utcnow()) + + source = Source.objects.create( + contributor=request.user.contributor, + source_type=Source.SINGLE, + is_public=public_submission, + create=should_create + ) + + country_code = get_country_code( + body_serializer.validated_data.get('country')) + name = body_serializer.validated_data.get('name') + address = body_serializer.validated_data.get('address') + + item = FacilityListItem.objects.create( + source=source, + row_index=0, + raw_data=request.data, + status=FacilityListItem.PARSED, + name=name, + address=address, + country_code=country_code, + processing_results=[{ + 'action': ProcessingAction.PARSE, + 'started_at': parse_started, + 'error': False, + 'finished_at': str(datetime.utcnow()), + 'is_geocoded': False, + }] + ) + + result = { + 'matches': [], + 'item_id': item.id, + 'geocoded_geometry': None, + 'geocoded_address': None, + 'status': item.status, + } + + geocode_started = str(datetime.utcnow()) + try: + geocode_result = geocode_address(address, country_code) + if geocode_result['result_count'] > 0: + item.status = FacilityListItem.GEOCODED + item.geocoded_point = Point( + geocode_result["geocoded_point"]["lng"], + geocode_result["geocoded_point"]["lat"] + ) + item.geocoded_address = geocode_result["geocoded_address"] + + result['geocoded_geometry'] = { + 'type': 'Point', + 'coordinates': [ + geocode_result["geocoded_point"]["lng"], + geocode_result["geocoded_point"]["lat"], + ] + } + result['geocoded_address'] = item.geocoded_address + else: + item.status = FacilityListItem.GEOCODED_NO_RESULTS + + item.processing_results.append({ + 'action': ProcessingAction.GEOCODE, + 'started_at': geocode_started, + 'error': False, + 'skipped_geocoder': False, + 'data': geocode_result['full_response'], + 'finished_at': str(datetime.utcnow()), + }) + + item.save() + except Exception as e: + item.status = FacilityListItem.ERROR_GEOCODING + item.processing_results.append({ + 'action': ProcessingAction.GEOCODE, + 'started_at': geocode_started, + 'error': True, + 'message': str(e), + 'trace': traceback.format_exc(), + 'finished_at': str(datetime.utcnow()), + }) + item.save() + result['status'] = item.status + return Response(result, + status=status.HTTP_500_INTERNAL_SERVER_ERROR) + + match_started = str(datetime.utcnow()) + try: + match_results = match_item(country_code, name, address, item.id) + save_match_details(match_results) + + automatic_threshold = \ + match_results['results']['automatic_threshold'] + + item_matches = match_results['item_matches'] + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for facility_id, score in matches: + facility = Facility.objects.get(id=facility_id) + facility_dict = FacilityDetailsSerializer(facility).data + # calling `round` alone was not trimming digits + facility_dict['confidence'] = float(str(round(score, 4))) + if score < automatic_threshold: + if should_create: + # TODO: Move match confirmation out of the + # FacilityListViewSet and set URLs here + facility_dict['confirm_match_url'] = None + facility_dict['reject_match_url'] = None + result['matches'].append(facility_dict) + except Exception as e: + item.status = FacilityListItem.ERROR_MATCHING + item.processing_results.append({ + 'action': ProcessingAction.MATCH, + 'started_at': match_started, + 'error': True, + 'message': str(e), + 'finished_at': str(datetime.utcnow()) + }) + item.save() + result['status'] = item.status + return Response(result, + status=status.HTTP_500_INTERNAL_SERVER_ERROR) + + item.refresh_from_db() + result['item_id'] = item.id + result['status'] = item.status + if item.facility is not None: + result['oar_id'] = item.facility.id + if item.facility.created_from == item: + result['status'] = FacilityListItem.NEW_FACILITY + elif (item.status == FacilityListItem.MATCHED + and len(item_matches.keys()) == 0): + # This branch handles the case where they client has specified + # `create=false` but there are no matches, which means that we + # would have attempted to create a new `Facility`. + if item.geocoded_point is None: + result['status'] = FacilityListItem.ERROR_MATCHING + else: + result['status'] = FacilityListItem.NEW_FACILITY + + if should_create and result['status'] != FacilityListItem.ERROR_MATCHING: # noqa + return Response(result, status=status.HTTP_201_CREATED) + else: + return Response(result, status=status.HTTP_200_OK) @transaction.atomic def destroy(self, request, pk=None): @@ -671,17 +824,21 @@ def destroy(self, request, pk=None): ) now = str(datetime.utcnow()) - list_item = facility.created_from - list_item.status = FacilityListItem.DELETED - list_item.processing_results.append({ - 'action': ProcessingAction.DELETE_FACILITY, - 'started_at': now, - 'error': False, - 'finished_at': now, - 'deleted_oar_id': facility.id, - }) - list_item.facility = None - list_item.save() + list_items = FacilityListItem \ + .objects \ + .filter(facility=facility) \ + .filter(Q(source__create=False) | Q(id=facility.created_from.id)) + for list_item in list_items: + list_item.status = FacilityListItem.DELETED + list_item.processing_results.append({ + 'action': ProcessingAction.DELETE_FACILITY, + 'started_at': now, + 'error': False, + 'finished_at': now, + 'deleted_oar_id': facility.id, + }) + list_item.facility = None + list_item.save() match = facility.get_created_from_match() match.changeReason = 'Deleted {}'.format(facility.id) From e59ef7dfecd103aa7bf3f0b4bc993db4dc40657c Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Wed, 30 Oct 2019 08:18:01 -0700 Subject: [PATCH 09/10] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9097c7c61..f6f95099d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Add single facility submission endpoint [#896](https://github.com/open-apparel-registry/open-apparel-registry/pull/896) ### Changed - Replace facility history feature switch with feature flag [#881](https://github.com/open-apparel-registry/open-apparel-registry/pull/881) From 6dfda0c1ceece0daf284b9ef3b89b62256af2d1f Mon Sep 17 00:00:00 2001 From: Justin Walgran Date: Wed, 30 Oct 2019 13:51:30 -0700 Subject: [PATCH 10/10] Add swagger documentation for single-facility match endpoint This adds some example request and response data and documents the available query parameters. There are known issues: - Requests made with the "Try it out" form return a CSRF error. - The "data" parameter has an `undefined` data type. It was unclear how to properly define the data type within the `coreapi.Field` object. - The return status code is listed as 201, but this endpoint does not always create objects and sometimes returns 200. It was unclear how to change the response section. --- src/django/api/views.py | 189 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/src/django/api/views.py b/src/django/api/views.py index 7cdcfa835..d8a8dfa4b 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -497,6 +497,29 @@ def get_schema_fields(self, view): ), ] + if view.action == 'create': + return [ + coreapi.Field( + name='create', + location='query', + type='boolean', + required=False, + description=( + 'If false, match results will be returned, but a new ' + 'facility or facility match will not be saved'), + ), + coreapi.Field( + name='public', + location='query', + type='boolean', + required=False, + description=( + 'If false and a new facility or facility match is ' + 'created, the contributor will not be publicly ' + 'associated with the facility'), + ), + ] + return [] @@ -522,6 +545,24 @@ def get_link(self, path, method, base_url): return super(FacilitiesAutoSchema, self).get_link( path, method, base_url) + def _allows_filters(self, path, method): + return True + + def get_serializer_fields(self, path, method): + if method == 'POST': + return [ + coreapi.Field( + name='data', + location='body', + description=( + 'The country, name, and address of the facility. See ' + 'the sample request body above.'), + required=True, + ) + ] + + return [] + @schema(FacilitiesAutoSchema()) class FacilitiesViewSet(mixins.ListModelMixin, @@ -637,6 +678,154 @@ def retrieve(self, request, pk=None): @transaction.atomic def create(self, request): + """ + Matches submitted facility details to the full list of facilities. + By default, creates a new facility if there is no match, or associates + the authenticated contributor with the facility if there is a confident + match. + + **NOTE** The form below lists the return status code as 201. When + POSTing data with `create=false` the return status will be 200, not + 201. + + ## Sample Request Body + + { + "country": "China", + "name": "Nantong Jackbeanie Headwear & Garment Co. Ltd.", + "address": "No.808,the third industry park,Guoyuan Town,Nantong 226500." + } + + ## Sample Responses + + ### Automatic Match + + { + "matches": [ + { + "id": "CN2019303BQ3FZP", + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "properties": { + "name": "Nantong Jackbeanie Headwear Garment Co. Ltd.", + "address": "No. 808, The Third Industry Park, Guoyuan Town, Rugao City Nantong", + "country_code": "CN", + "oar_id": "CN2019303BQ3FZP", + "other_names": [], + "other_addresses": [], + "contributors": [ + { + "id": 4, + "name": "Researcher A (Summer 2019 Affiliate List)", + "is_verified": false + }, + { + "id": 12, + "name": "Brand B", + "is_verified": false + } + + ], + "country_name": "China", + "claim_info": null, + "other_locations": [] + }, + "confidence": 0.8153 + } + ], + "item_id": 964, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "geocoded_address": "Guoyuanzhen, Rugao, Nantong, Jiangsu, China", + "status": "MATCHED", + "oar_id": "CN2019303BQ3FZP" + } + + ### Potential Match + + { + "matches": [ + { + "id": "CN2019303BQ3FZP", + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "properties": { + "name": "Nantong Jackbeanie Headwear Garment Co. Ltd.", + "address": "No. 808, The Third Industry Park, Guoyuan Town, Rugao City Nantong", + "country_code": "CN", + "oar_id": "CN2019303BQ3FZP", + "other_names": [], + "other_addresses": [], + "contributors": [ + { + "id": 4, + "name": "Researcher A (Summer 2019 Affiliate List)", + "is_verified": false + } + ], + "country_name": "China", + "claim_info": null, + "other_locations": [] + }, + "confidence": 0.7686 + } + ], + "item_id": 959, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "geocoded_address": "Guoyuanzhen, Rugao, Nantong, Jiangsu, China", + "status": "POTENTIAL_MATCH" + } + + + ### New Facility + + { + "matches": [], + "item_id": 954, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 119.2221539, + 33.79772 + ] + }, + "geocoded_address": "30, 32 Yanhuang Ave, Lianshui Xian, Huaian Shi, Jiangsu Sheng, China, 223402", + "status": "NEW_FACILITY" + } + + ### No Match and Geocoder Returned No Results + + { + "matches": [], + "item_id": 965, + "geocoded_geometry": null, + "geocoded_address": null, + "status": "ERROR_MATCHING" + } + """ # noqa # Adding the @permission_classes decorator was not working so we # explicitly invoke our custom permission class. if not IsRegisteredAndConfirmed().has_permission(request, self):