diff --git a/CHANGELOG.md b/CHANGELOG.md index 9097c7c61..f6f95099d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Add single facility submission endpoint [#896](https://github.com/open-apparel-registry/open-apparel-registry/pull/896) ### Changed - Replace facility history feature switch with feature flag [#881](https://github.com/open-apparel-registry/open-apparel-registry/pull/881) diff --git a/src/django/api/constants.py b/src/django/api/constants.py index 8c139b93c..6ca2cdadc 100644 --- a/src/django/api/constants.py +++ b/src/django/api/constants.py @@ -40,6 +40,17 @@ class FacilityMergeQueryParams: MERGE = 'merge' +class FacilityCreateQueryParams: + CREATE = 'create' + PUBLIC = 'public' + + +class FeatureGroups: + CAN_GET_FACILITY_HISTORY = 'can_get_facility_history' + CAN_SUBMIT_FACILITY = 'can_submit_facility' + CAN_SUBMIT_PRIVATE_FACILITY = 'can_submit_private_facility' + + class FacilityHistoryActions: CREATE = 'CREATE' UPDATE = 'UPDATE' diff --git a/src/django/api/management/commands/batch_process.py b/src/django/api/management/commands/batch_process.py index 2cc4be5ab..3213543c3 100644 --- a/src/django/api/management/commands/batch_process.py +++ b/src/django/api/management/commands/batch_process.py @@ -6,9 +6,9 @@ from api.constants import ProcessingAction from api.models import FacilityList, FacilityListItem +from api.matching import match_facility_list_items from api.processing import (parse_facility_list_item, geocode_facility_list_item, - match_facility_list_items, save_match_details) LINE_ITEM_ACTIONS = { diff --git a/src/django/api/matching.py b/src/django/api/matching.py new file mode 100644 index 000000000..f9df7db15 --- /dev/null +++ b/src/django/api/matching.py @@ -0,0 +1,421 @@ +import dedupe +import logging +import os +import re +import threading + +from collections import defaultdict +from datetime import datetime +from django.conf import settings +from django.db import transaction +from django.db.models import Q, Max +from unidecode import unidecode + +from api.models import (Facility, + FacilityList, + FacilityListItem, + HistoricalFacility) + +logger = logging.getLogger(__name__) + + +def clean(column): + """ + Remove punctuation and excess whitespace from a value before using it to + find matches. This should be the same function used when developing the + training data read from training.json as part of train_gazetteer. + """ + column = unidecode(column) + column = re.sub('\n', ' ', column) + column = re.sub('-', '', column) + column = re.sub('/', ' ', column) + column = re.sub("'", '', column) + column = re.sub(",", '', column) + column = re.sub(":", ' ', column) + column = re.sub(' +', ' ', column) + column = column.strip().strip('"').strip("'").lower().strip() + if not column: + column = None + return column + + +def get_canonical_items(): + """ + Fetch all `Facility` items and create a dictionary suitable for use by a + Dedupe model. + + Returns: + A dictionary. The key is the `Facility` OAR ID. The value is a dictionary + of clean field values keyed by field name (country, name, address). A + "clean" value is one which has been passed through the `clean` function. 
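+    For example, a single entry in the returned dictionary might look like
+    this (the OAR ID and field values here are purely illustrative):
+
+        {
+            'US2019123AB4CDE': {
+                'country': 'us',
+                'name': 'pants hut',
+                'address': '123 main st anywhereville pa'
+            }
+        }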
+ """ + facility_set = Facility.objects.all().extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in facility_set} + + +def get_messy_items_from_facility_list(facility_list): + """ + Fetch all `FacilityListItem` objects that belong to the specified + `FacilityList` and create a dictionary suitable for use by a Dedupe model. + + Arguments: + facility_list -- A `FacilityList`. + + Returns: + A dictionary. The key is the `FacilityListItem` ID. The value is a + dictionary of clean field values keyed by field name (country, name, + address). A "clean" value is one which has been passed through the `clean` + function. + """ + facility_list_item_set = facility_list.source.facilitylistitem_set.filter( + Q(status=FacilityListItem.GEOCODED) + | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)).extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in facility_list_item_set} + + +def get_messy_items_for_training(mod_factor=5): + """ + Fetch a subset of `FacilityListItem` objects that have been parsed and are + not in an error state. + + Arguments: + mod_factor -- Used to partition a subset of `FacilityListItem` records. The + larger the value, the fewer records will be contained in the + subset. + + Returns: + A dictionary. The key is the `FacilityListItem` ID. The value is a + dictionary of clean field values keyed by field name (country, name, + address). A "clean" value is one which has been passed through the `clean` + function. + """ + facility_list_item_set = FacilityListItem.objects.exclude( + Q(status=FacilityListItem.UPLOADED) + | Q(status=FacilityListItem.ERROR) + | Q(status=FacilityListItem.ERROR_PARSING) + | Q(status=FacilityListItem.ERROR_GEOCODING) + | Q(status=FacilityListItem.ERROR_MATCHING) + ).extra( + select={'country': 'country_code'}).values( + 'id', 'country', 'name', 'address') + records = [record for (i, record) in enumerate(facility_list_item_set) + if i % mod_factor == 0] + return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} + for i in records} + + +def train_gazetteer(messy, canonical, model_settings=None, should_index=False): + """ + Train and return a dedupe.Gazetteer using the specified messy and canonical + dictionaries. The messy and canonical objects should have the same + structure: + - The key is a unique ID + - The value is another dictionary of field:value pairs. This dictionary + must contain at least 'country', 'name', and 'address' keys. + + Reads a training.json file containing positive and negative matches. 
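+    A minimal sketch of training and indexing a gazetteer outside of the
+    list-processing pipeline, using the helpers defined in this module:
+
+        canonical = get_canonical_items()
+        messy = get_messy_items_for_training()
+        gazetteer = train_gazetteer(messy, canonical, should_index=True)
+        # The returned gazetteer is what match_items uses via its
+        # threshold() and match() methods.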
+ """ + if model_settings: + gazetteer = dedupe.StaticGazetteer(model_settings) + else: + fields = [ + {'field': 'country', 'type': 'Exact'}, + {'field': 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String'}, + ] + + gazetteer = dedupe.Gazetteer(fields) + gazetteer.sample(messy, canonical, 15000) + training_file = os.path.join(settings.BASE_DIR, 'api', 'data', + 'training.json') + with open(training_file) as tf: + gazetteer.readTraining(tf) + gazetteer.train() + gazetteer.cleanupTraining() + + if should_index: + index_start = datetime.now() + logger.info('Indexing started') + gazetteer.index(canonical) + index_duration = datetime.now() - index_start + logger.info('Indexing finished ({})'.format(index_duration)) + logger.info('Cleanup training') + + return gazetteer + + +class MatchDefaults: + AUTOMATIC_THRESHOLD = 0.8 + GAZETTEER_THRESHOLD = 0.5 + RECALL_WEIGHT = 1.0 + + +class NoCanonicalRecordsError(Exception): + pass + + +def match_items(messy, + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Attempt to match each of the "messy" items specified with a "canonical" + item. + + This function reads from but does not update the database. + + When an argument description mentions a "clean" value it is referring to a + value that has been passed through the `clean` function. + + Arguments: + messy -- A dictionary. The key is the unique identifier of each item to be + matched. The value is a dictionary of clean field values keyed by + field name (country, name, address). + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. + https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + An dict containing the results of the matching process and contains the + following keys: + + processed_list_item_ids -- A list of all the keys in `messy` that were + considered for matching. + item_matches -- A dictionary where the keys are `messy` keys and the values + are lists of tuples where the first element is a key from + `canonical` representing an item that is a potential match + and the second element is the confidence score of the + match. + results -- A dictionary containing additional information about the + matching process that pertains to all the `messy` items and + contains the following keys: + gazetteer_threshold -- The threshold computed from the trained model + automatic_threshold -- The value of the automatic_threshold parameter + returned for convenience + recall_weight -- The value of the recall_weight parameter returned for + convenience. + code_version -- The value of the GIT_COMMIT setting. + started -- The date and time at which the training and matching was + started. + finished -- The date and time at which the training and matching was + finished. 
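+    Example (illustrative only; the messy key, OAR ID, and score below are
+    hypothetical):
+
+        results = match_items({
+            '12': {
+                'country': 'us',
+                'name': 'pants hut',
+                'address': '123 main st anywhereville pa'
+            }
+        })
+        results['item_matches']
+        # e.g. {'12': [('US2019123AB4CDE', 0.81)]}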
+ """ + started = str(datetime.utcnow()) + if len(messy.keys()) > 0: + no_geocoded_items = False + try: + gazetteer = GazetteerCache.get_latest() + gazetteer.threshold(messy, recall_weight=recall_weight) + results = gazetteer.match(messy, threshold=gazetteer_threshold, + n_matches=None, generator=True) + no_gazetteer_matches = False + except NoCanonicalRecordsError: + results = [] + no_gazetteer_matches = True + except dedupe.core.BlockingError: + results = [] + no_gazetteer_matches = True + else: + results = [] + no_gazetteer_matches = Facility.objects.count() == 0 + no_geocoded_items = len(messy.keys()) == 0 + + finished = str(datetime.utcnow()) + + item_matches = defaultdict(list) + for matches in results: + for (messy_id, canon_id), score in matches: + item_matches[messy_id].append((canon_id, score)) + + return { + 'processed_list_item_ids': list(messy.keys()), + 'item_matches': item_matches, + 'results': { + 'no_gazetteer_matches': no_gazetteer_matches, + 'no_geocoded_items': no_geocoded_items, + 'gazetteer_threshold': gazetteer_threshold, + 'automatic_threshold': automatic_threshold, + 'recall_weight': recall_weight, + 'code_version': settings.GIT_COMMIT + }, + 'started': started, + 'finished': finished + } + + +def match_facility_list_items( + facility_list, + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Fetch items from the specified `FacilityList` and match them to the current + list of facilities. + + Arguments: + facility_list -- A FacilityList instance + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. + https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + See `match_items`. + + """ + if type(facility_list) != FacilityList: + raise ValueError('Argument must be a FacilityList') + + return match_items(get_messy_items_from_facility_list(facility_list), + automatic_threshold=automatic_threshold, + gazetteer_threshold=gazetteer_threshold, + recall_weight=recall_weight) + + +def match_item(country, + name, + address, + id='id', + automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD, + gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD, + recall_weight=MatchDefaults.RECALL_WEIGHT): + """ + Match the details of a single facility to the list of existing facilities. + + Arguments: + country -- A valid country name or 2-character ISO code. + name -- The name of the facility. + address -- The address of the facility. + id -- The key value in the returned match results. + automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence + score greater than this value will be assigned + automatically. + gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence + score between this value and the + `automatic_threshold` will be considers a match that + requires confirmation. + recall_weight -- Sets the tradeoff between precision and recall. A value of + 1.0 give an equal weight to precision and recall. 
+ https://en.wikipedia.org/wiki/Precision_and_recall + https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html + + Returns: + See `match_items`. + """ + return match_items( + { + str(id): { + "country": clean(country), + "name": clean(name), + "address": clean(address) + } + }, + automatic_threshold=automatic_threshold, + gazetteer_threshold=gazetteer_threshold, + recall_weight=recall_weight) + + +def facility_to_dedupe_record(facility): + """ + Convert a `Facility` into a dictionary suitable for training and indexing a + Dedupe model. + + Arguments: + facility -- A `Facility` object. + + Returns: + A dictionary with the `Facility` id as the key and a dictionary of fields + as the value. + """ + return { + str(facility.id): { + "country": clean(facility.country_code), + "name": clean(facility.name), + "address": clean(facility.address), + } + } + + +class GazetteerCache: + """ + A container for holding a single, trained and indexed Gazetteer in memory, + which is updated with any `Facility` rows that have been added, updated, or + removed since the previous call to the `get_latest` class method. + + Note that the first time `get_latest` is called it will be slow, as it + needs to train a model and index it with all the `Facility` items. + """ + _lock = threading.Lock() + _gazetter = None + _version = None + + @classmethod + @transaction.atomic + def get_latest(cls): + with cls._lock: + db_version = HistoricalFacility.objects.aggregate( + max_id=Max('history_id')).get('max_id') + if cls._gazetter is None: + canonical = get_canonical_items() + if len(canonical.keys()) == 0: + raise NoCanonicalRecordsError() + cls._gazetter = train_gazetteer(get_messy_items_for_training(), + canonical, + should_index=True) + cls._version = db_version + if db_version != cls._version: + if cls._version is None: + last_version_id = 0 + else: + last_version_id = cls._version + changes = HistoricalFacility \ + .objects \ + .filter(history_id__gt=last_version_id) \ + .order_by('history_id') \ + .extra(select={'country': 'country_code'}) \ + .values('id', 'country', 'name', 'address', + 'history_type') + for item in changes: + if item['history_type'] == '-': + cls._gazetter.unindex({ + item['id']: { + 'country': clean(item['country']), + 'name': clean(item['name']), + 'address': clean(item['address']), + } + }) + else: + # The history record has old field values, so we + # fetch the latest version + try: + facility = Facility.objects.get(id=item['id']) + cls._gazetter.index( + facility_to_dedupe_record(facility)) + except Facility.DoesNotExist: + # If the facility no longer exists, just skip + # indexing. 
+ pass + cls._version = db_version + + return cls._gazetter diff --git a/src/django/api/migrations/0038_add_facility_history_flag.py b/src/django/api/migrations/0038_add_facility_history_flag.py index d3acf5c7e..9b04b85d7 100644 --- a/src/django/api/migrations/0038_add_facility_history_flag.py +++ b/src/django/api/migrations/0038_add_facility_history_flag.py @@ -1,15 +1,19 @@ from django.db import migrations +from api.constants import FeatureGroups + def create_can_get_facility_history(apps, schema_editor): Group = apps.get_model('auth', 'Group') - history_group = Group.objects.create(name='can_get_facility_history') + history_group = Group.objects.create( + name=FeatureGroups.CAN_GET_FACILITY_HISTORY) Flag = apps.get_model('waffle', 'Flag') - history_flag = Flag.objects.create(name='can_get_facility_history', - superusers=True, - staff=False, - note='Used for history API endpoint authorization') + history_flag = Flag.objects.create( + name=FeatureGroups.CAN_GET_FACILITY_HISTORY, + superusers=True, + staff=False, + note='Used for history API endpoint authorization') history_flag.groups.set([history_group]) diff --git a/src/django/api/migrations/0040_add_submit_facility_flag.py b/src/django/api/migrations/0040_add_submit_facility_flag.py new file mode 100644 index 000000000..6be7a5e4b --- /dev/null +++ b/src/django/api/migrations/0040_add_submit_facility_flag.py @@ -0,0 +1,26 @@ +from django.db import migrations + + +def create_can_submit_facility(apps, schema_editor): + Group = apps.get_model('auth', 'Group') + submit_facility_group = Group.objects.create(name='can_submit_facility') + + Flag = apps.get_model('waffle', 'Flag') + submit_facility_flag = Flag.objects.create( + name='can_submit_facility', + superusers=True, + staff=False, + note='Used for single facility submission API endpoint authorization') + + submit_facility_flag.groups.set([submit_facility_group]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0039_delete_facility_history_switch'), + ] + + operations = [ + migrations.RunPython(create_can_submit_facility), + ] diff --git a/src/django/api/migrations/0041_add_private_facility_flag.py b/src/django/api/migrations/0041_add_private_facility_flag.py new file mode 100644 index 000000000..6931cf6d7 --- /dev/null +++ b/src/django/api/migrations/0041_add_private_facility_flag.py @@ -0,0 +1,29 @@ +from django.db import migrations + +from api.constants import FeatureGroups + + +def create_can_submit_private_facility(apps, schema_editor): + Group = apps.get_model('auth', 'Group') + submit_private_facility_group = Group.objects.create( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY) + + Flag = apps.get_model('waffle', 'Flag') + submit_private_facility_flag = Flag.objects.create( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY, + superusers=True, + staff=False, + note='Used for private facility submission API param authorization') + + submit_private_facility_flag.groups.set([submit_private_facility_group]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0040_add_submit_facility_flag'), + ] + + operations = [ + migrations.RunPython(create_can_submit_private_facility), + ] diff --git a/src/django/api/processing.py b/src/django/api/processing.py index 5cab622de..c31342d84 100644 --- a/src/django/api/processing.py +++ b/src/django/api/processing.py @@ -1,23 +1,17 @@ import csv -import os import traceback -import re import sys -import dedupe import xlrd -from collections import defaultdict from datetime import datetime -from unidecode 
import unidecode from django.conf import settings from django.contrib.gis.geos import Point from django.core.exceptions import ValidationError -from django.db.models import Q from api.constants import CsvHeaderField, ProcessingAction -from api.models import Facility, FacilityMatch, FacilityList, FacilityListItem +from api.models import Facility, FacilityMatch, FacilityListItem from api.countries import COUNTRY_CODES, COUNTRY_NAMES from api.geocoding import geocode_address @@ -217,157 +211,6 @@ def geocode_facility_list_item(item): }) -def clean(column): - """ - Remove punctuation and excess whitespace from a value before using it to - find matches. This should be the same function used when developing the - training data read from training.json as part of train_gazetteer. - """ - column = unidecode(column) - column = re.sub('\n', ' ', column) - column = re.sub('-', '', column) - column = re.sub('/', ' ', column) - column = re.sub("'", '', column) - column = re.sub(",", '', column) - column = re.sub(":", ' ', column) - column = re.sub(' +', ' ', column) - column = column.strip().strip('"').strip("'").lower().strip() - if not column: - column = None - return column - - -def train_gazetteer(messy, canonical): - """ - Train and return a dedupe.Gazetteer using the specified messy and canonical - dictionaries. The messy and canonical objects should have the same - structure: - - The key is a unique ID - - The value is another dictionary of field:value pairs. This dictionary - must contain at least 'country', 'name', and 'address' keys. - - Reads a training.json file containing positive and negative matches. - """ - fields = [ - {'field': 'country', 'type': 'Exact'}, - {'field': 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String'}, - ] - - gazetteer = dedupe.Gazetteer(fields) - gazetteer.sample(messy, canonical, 15000) - training_file = os.path.join(settings.BASE_DIR, 'api', 'data', - 'training.json') - with open(training_file) as tf: - gazetteer.readTraining(tf) - gazetteer.train() - gazetteer.index(canonical) - gazetteer.cleanupTraining() - # The gazetteer example in the dedupeio/dedupe-examples repository called - # index both after training and after calling cleanupTraining. - gazetteer.index(canonical) - return gazetteer - - -def match_facility_list_items(facility_list, automatic_threshold=0.8, - gazetteer_threshold=0.5, recall_weight=1.0): - """ - Attempt to match all the items in the specified FacilityList that are in - the GEOCODED status with existing facilities. - - This function reads from but does not update the database. - - Arguments: - facility_list -- A FacilityList instance - automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence - score greater than this value will be assigned - automatically. - recall_weight -- Sets the tradeoff between precision and recall. A value of - 1.0 give an equal weight to precision and recall. - https://en.wikipedia.org/wiki/Precision_and_recall - https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html - - Returns: - An dict containing the results of the matching process and contains the - following keys: - - processed_list_item_ids -- A list of all the FacilityListItem IDs that were - considered for matching. - item_matches -- A dictionary where the keys are FacilityListItem IDs and - the values are lists of tuples where the first element is - the ID of a Facility that is a potential match and the - second element is the confidence score of the match. 
- results -- A dictionary containing additional information about the - matching process that pertains to all the list items and - contains the following keys: - gazetteer_threshold -- The threshold computed from the trained model - automatic_threshold -- The value of the automatic_threshold parameter - returned for convenience - recall_weight -- The value of the recall_weight parameter returned for - convenience. - code_version -- The value of the GIT_COMMIT setting. - started -- The date and time at which the training and matching was - started. - finished -- The date and time at which the training and matching was - finished. - """ - started = str(datetime.utcnow()) - if type(facility_list) != FacilityList: - raise ValueError('Argument must be a FacilityList') - - facility_set = Facility.objects.all().extra( - select={'country': 'country_code'}).values( - 'id', 'country', 'name', 'address') - canonical = {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} - for i in facility_set} - - facility_list_item_set = facility_list.source.facilitylistitem_set.filter( - Q(status=FacilityListItem.GEOCODED) - | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)).extra( - select={'country': 'country_code'}).values( - 'id', 'country', 'name', 'address') - messy = {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'} - for i in facility_list_item_set} - - if len(canonical.keys()) > 0 and len(messy.keys()) > 0: - no_geocoded_items = False - gazetteer = train_gazetteer(messy, canonical) - try: - gazetteer.threshold(messy, recall_weight=recall_weight) - results = gazetteer.match(messy, threshold=gazetteer_threshold, - n_matches=None, generator=True) - no_gazetteer_matches = False - except dedupe.core.BlockingError: - results = [] - no_gazetteer_matches = True - else: - results = [] - no_gazetteer_matches = len(canonical.keys()) == 0 - no_geocoded_items = len(messy.keys()) == 0 - - finished = str(datetime.utcnow()) - - item_matches = defaultdict(list) - for matches in results: - for (messy_id, canon_id), score in matches: - item_matches[messy_id].append((canon_id, score)) - - return { - 'processed_list_item_ids': list(messy.keys()), - 'item_matches': item_matches, - 'results': { - 'no_gazetteer_matches': no_gazetteer_matches, - 'no_geocoded_items': no_geocoded_items, - 'gazetteer_threshold': gazetteer_threshold, - 'automatic_threshold': automatic_threshold, - 'recall_weight': recall_weight, - 'code_version': settings.GIT_COMMIT - }, - 'started': started, - 'finished': finished - } - - def save_match_details(match_results): """ Save the results of a call to match_facility_list_items by creating @@ -427,8 +270,9 @@ def make_pending_match(item_id, facility_id, score): }) item.save() - for m in matches: - m.save() + if item.source.create: + for m in matches: + m.save() unmatched = (FacilityListItem.objects .filter(id__in=processed_list_item_ids) @@ -445,19 +289,20 @@ def make_pending_match(item_id, facility_id, score): 'finished_at': finished }) else: - facility = Facility(name=item.name, - address=item.address, - country_code=item.country_code, - location=item.geocoded_point, - created_from=item) - facility.save() - - match = make_pending_match(item.id, facility.id, 1.0) - match.results['match_type'] = 'no_gazetteer_match' - match.status = FacilityMatch.AUTOMATIC - match.save() - - item.facility = facility + if item.source.create: + facility = Facility(name=item.name, + address=item.address, + country_code=item.country_code, + location=item.geocoded_point, + created_from=item) + facility.save() + + match = 
make_pending_match(item.id, facility.id, 1.0) + match.results['match_type'] = 'no_gazetteer_match' + match.status = FacilityMatch.AUTOMATIC + match.save() + + item.facility = facility item.status = FacilityListItem.MATCHED item.processing_results.append({ 'action': ProcessingAction.MATCH, diff --git a/src/django/api/serializers.py b/src/django/api/serializers.py index bf393aba0..408cedca6 100644 --- a/src/django/api/serializers.py +++ b/src/django/api/serializers.py @@ -12,6 +12,7 @@ EmailField, IntegerField, ListField, + BooleanField, ModelSerializer, SerializerMethodField, ValidationError, @@ -33,6 +34,7 @@ ProductType, ProductionType) from api.countries import COUNTRY_NAMES, COUNTRY_CHOICES +from api.processing import get_country_code from waffle import switch_is_active @@ -538,6 +540,23 @@ def get_claim_info(self, facility): return None +class FacilityCreateBodySerializer(Serializer): + country = CharField(required=True) + name = CharField(required=True, max_length=200) + address = CharField(required=True, max_length=200) + + def validate_country(self, value): + try: + return get_country_code(value) + except ValueError as ve: + raise ValidationError(ve) + + +class FacilityCreateQueryParamsSerializer(Serializer): + create = BooleanField(default=True, required=False) + public = BooleanField(default=True, required=False) + + class FacilityClaimSerializer(ModelSerializer): facility_name = SerializerMethodField() oar_id = SerializerMethodField() diff --git a/src/django/api/tests.py b/src/django/api/tests.py index 127d24397..450b9ea5c 100644 --- a/src/django/api/tests.py +++ b/src/django/api/tests.py @@ -17,21 +17,23 @@ from api.constants import (ProcessingAction, LogDownloadQueryParams, - UpdateLocationParams) + UpdateLocationParams, + FeatureGroups) from api.models import (Facility, FacilityList, FacilityListItem, FacilityClaim, FacilityClaimReviewNote, FacilityMatch, FacilityAlias, Contributor, User, RequestLog, DownloadLog, FacilityLocation, Source) from api.oar_id import make_oar_id, validate_oar_id +from api.matching import match_facility_list_items from api.processing import (parse_facility_list_item, - geocode_facility_list_item, - match_facility_list_items) + geocode_facility_list_item) from api.geocoding import (create_geocoding_params, format_geocoded_address_data, geocode_address) from api.test_data import parsed_city_hall_data from api.permissions import referring_host_is_allowed, referring_host -from api.serializers import ApprovedFacilityClaimSerializer +from api.serializers import (ApprovedFacilityClaimSerializer, + FacilityCreateBodySerializer) class FacilityListCreateTest(APITestCase): @@ -1129,7 +1131,7 @@ def interspace(string): def junk_chars(string): - return string + 'YY' + return 'AA' + string + 'YY' class DedupeMatchingTests(TestCase): @@ -3953,7 +3955,7 @@ def setUp(self): self.facility_two.id ) - @override_flag('can_get_facility_history', active=True) + @override_flag(FeatureGroups.CAN_GET_FACILITY_HISTORY, active=True) def test_serializes_deleted_facility_history(self): delete_facility_url = '/api/facilities/{}/'.format(self.facility.id) delete_response = self.client.delete(delete_facility_url) @@ -4695,3 +4697,137 @@ def test_in_group_receives_200(self): history_response.status_code, 200, ) + + +class FacilitySubmitTest(FacilityAPITestCaseBase): + def setUp(self): + super(FacilitySubmitTest, self).setUp() + self.url = reverse('facility-list') + self.valid_facility = { + 'country': 'United States', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, 
PA' + } + + def join_group_and_login(self): + self.client.logout() + group = auth.models.Group.objects.get( + name=FeatureGroups.CAN_SUBMIT_FACILITY, + ) + self.user.groups.set([group.id]) + self.user.save() + self.client.login(email=self.user_email, + password=self.user_password) + + def test_unauthenticated_receives_401(self): + self.client.logout() + response = self.client.post(self.url) + self.assertEqual(response.status_code, 401) + + def test_not_in_group_receives_403(self): + self.client.logout() + self.client.login(email=self.user_email, + password=self.user_password) + + response = self.client.post(self.url) + + self.assertEqual(response.status_code, 403) + + def test_empty_body_is_invalid(self): + self.join_group_and_login() + response = self.client.post(self.url) + self.assertEqual(response.status_code, 400) + + def test_missing_fields_are_invalid(self): + self.join_group_and_login() + + response = self.client.post(self.url, { + 'country': 'US', + 'name': 'Something', + }) + self.assertEqual(response.status_code, 400) + + response = self.client.post(self.url, { + 'country': 'US', + 'address': 'Some street', + }) + self.assertEqual(response.status_code, 400) + + response = self.client.post(self.url, { + 'name': 'Something', + 'address': 'Some street', + }) + self.assertEqual(response.status_code, 400) + + def test_valid_request(self): + self.join_group_and_login() + response = self.client.post(self.url, self.valid_facility) + self.assertEqual(response.status_code, 201) + + def test_valid_request_with_params(self): + self.join_group_and_login() + url_with_query = '{}?create=false&public=true'.format(self.url) + response = self.client.post(url_with_query, self.valid_facility) + self.assertEqual(response.status_code, 200) + + def test_private_permission(self): + self.join_group_and_login() + url_with_query = '{}?public=false'.format(self.url) + response = self.client.post(url_with_query, self.valid_facility) + self.assertEqual(response.status_code, 403) + + group = auth.models.Group.objects.get( + name=FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY, + ) + self.user.groups.add(group.id) + self.user.save() + + response = self.client.post(url_with_query, self.valid_facility) + self.assertEqual(response.status_code, 201) + + +class FacilityCreateBodySerializerTest(TestCase): + def test_valid_data(self): + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertTrue(serializer.is_valid()) + + def test_missing_fields(self): + serializer = FacilityCreateBodySerializer(data={ + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('country', serializer.errors) + self.assertNotIn('name', serializer.errors) + self.assertNotIn('address', serializer.errors) + + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('name', serializer.errors) + self.assertNotIn('country', serializer.errors) + self.assertNotIn('address', serializer.errors) + + serializer = FacilityCreateBodySerializer(data={ + 'country': 'United States', + 'name': 'Pants Hut', + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('address', serializer.errors) + self.assertNotIn('country', serializer.errors) + self.assertNotIn('name', serializer.errors) + + def test_invalid_country(self): + serializer = 
FacilityCreateBodySerializer(data={ + 'country': 'Notrealia', + 'name': 'Pants Hut', + 'address': '123 Main St, Anywhereville, PA' + }) + self.assertFalse(serializer.is_valid()) + self.assertIn('country', serializer.errors) diff --git a/src/django/api/views.py b/src/django/api/views.py index 262ac528a..d8a8dfa4b 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -1,6 +1,7 @@ import operator import os import sys +import traceback from datetime import datetime from functools import reduce @@ -51,9 +52,13 @@ FacilityListQueryParams, FacilityListItemsQueryParams, FacilityMergeQueryParams, + FacilityCreateQueryParams, ProcessingAction, LogDownloadQueryParams, - UpdateLocationParams) + UpdateLocationParams, + FeatureGroups) +from api.geocoding import geocode_address +from api.matching import match_item from api.models import (FacilityList, FacilityListItem, FacilityClaim, @@ -67,7 +72,11 @@ Version, FacilityLocation, Source) -from api.processing import parse_csv_line, parse_csv, parse_excel +from api.processing import (parse_csv_line, + parse_csv, + parse_excel, + get_country_code, + save_match_details) from api.serializers import (FacilityListSerializer, FacilityListItemSerializer, FacilityListItemsQueryParamsSerializer, @@ -75,6 +84,8 @@ FacilityListQueryParamsSerializer, FacilitySerializer, FacilityDetailsSerializer, + FacilityCreateBodySerializer, + FacilityCreateQueryParamsSerializer, UserSerializer, UserProfileSerializer, FacilityClaimSerializer, @@ -486,6 +497,29 @@ def get_schema_fields(self, view): ), ] + if view.action == 'create': + return [ + coreapi.Field( + name='create', + location='query', + type='boolean', + required=False, + description=( + 'If false, match results will be returned, but a new ' + 'facility or facility match will not be saved'), + ), + coreapi.Field( + name='public', + location='query', + type='boolean', + required=False, + description=( + 'If false and a new facility or facility match is ' + 'created, the contributor will not be publicly ' + 'associated with the facility'), + ), + ] + return [] @@ -511,11 +545,30 @@ def get_link(self, path, method, base_url): return super(FacilitiesAutoSchema, self).get_link( path, method, base_url) + def _allows_filters(self, path, method): + return True + + def get_serializer_fields(self, path, method): + if method == 'POST': + return [ + coreapi.Field( + name='data', + location='body', + description=( + 'The country, name, and address of the facility. See ' + 'the sample request body above.'), + required=True, + ) + ] + + return [] + @schema(FacilitiesAutoSchema()) class FacilitiesViewSet(mixins.ListModelMixin, mixins.RetrieveModelMixin, mixins.DestroyModelMixin, + mixins.CreateModelMixin, viewsets.GenericViewSet): """ Get facilities in GeoJSON format. @@ -623,6 +676,325 @@ def retrieve(self, request, pk=None): except Facility.DoesNotExist: raise NotFound() + @transaction.atomic + def create(self, request): + """ + Matches submitted facility details to the full list of facilities. + By default, creates a new facility if there is no match, or associates + the authenticated contributor with the facility if there is a confident + match. + + **NOTE** The form below lists the return status code as 201. When + POSTing data with `create=false` the return status will be 200, not + 201. + + ## Sample Request Body + + { + "country": "China", + "name": "Nantong Jackbeanie Headwear & Garment Co. Ltd.", + "address": "No.808,the third industry park,Guoyuan Town,Nantong 226500." 
+ } + + ## Sample Responses + + ### Automatic Match + + { + "matches": [ + { + "id": "CN2019303BQ3FZP", + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "properties": { + "name": "Nantong Jackbeanie Headwear Garment Co. Ltd.", + "address": "No. 808, The Third Industry Park, Guoyuan Town, Rugao City Nantong", + "country_code": "CN", + "oar_id": "CN2019303BQ3FZP", + "other_names": [], + "other_addresses": [], + "contributors": [ + { + "id": 4, + "name": "Researcher A (Summer 2019 Affiliate List)", + "is_verified": false + }, + { + "id": 12, + "name": "Brand B", + "is_verified": false + } + + ], + "country_name": "China", + "claim_info": null, + "other_locations": [] + }, + "confidence": 0.8153 + } + ], + "item_id": 964, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "geocoded_address": "Guoyuanzhen, Rugao, Nantong, Jiangsu, China", + "status": "MATCHED", + "oar_id": "CN2019303BQ3FZP" + } + + ### Potential Match + + { + "matches": [ + { + "id": "CN2019303BQ3FZP", + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "properties": { + "name": "Nantong Jackbeanie Headwear Garment Co. Ltd.", + "address": "No. 808, The Third Industry Park, Guoyuan Town, Rugao City Nantong", + "country_code": "CN", + "oar_id": "CN2019303BQ3FZP", + "other_names": [], + "other_addresses": [], + "contributors": [ + { + "id": 4, + "name": "Researcher A (Summer 2019 Affiliate List)", + "is_verified": false + } + ], + "country_name": "China", + "claim_info": null, + "other_locations": [] + }, + "confidence": 0.7686 + } + ], + "item_id": 959, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 120.596047, + 32.172013 + ] + }, + "geocoded_address": "Guoyuanzhen, Rugao, Nantong, Jiangsu, China", + "status": "POTENTIAL_MATCH" + } + + + ### New Facility + + { + "matches": [], + "item_id": 954, + "geocoded_geometry": { + "type": "Point", + "coordinates": [ + 119.2221539, + 33.79772 + ] + }, + "geocoded_address": "30, 32 Yanhuang Ave, Lianshui Xian, Huaian Shi, Jiangsu Sheng, China, 223402", + "status": "NEW_FACILITY" + } + + ### No Match and Geocoder Returned No Results + + { + "matches": [], + "item_id": 965, + "geocoded_geometry": null, + "geocoded_address": null, + "status": "ERROR_MATCHING" + } + """ # noqa + # Adding the @permission_classes decorator was not working so we + # explicitly invoke our custom permission class. 
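+        # Two layers of authorization apply here: the requester must be a
+        # registered, email-confirmed user (otherwise 401), and must belong
+        # to a group attached to the can_submit_facility waffle flag
+        # (otherwise 403).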
+ if not IsRegisteredAndConfirmed().has_permission(request, self): + return Response(status=status.HTTP_401_UNAUTHORIZED) + if not flag_is_active(request._request, + FeatureGroups.CAN_SUBMIT_FACILITY): + raise PermissionDenied() + + body_serializer = FacilityCreateBodySerializer(data=request.data) + body_serializer.is_valid(raise_exception=True) + + params_serializer = FacilityCreateQueryParamsSerializer( + data=request.query_params) + params_serializer.is_valid(raise_exception=True) + should_create = params_serializer.validated_data[ + FacilityCreateQueryParams.CREATE] + public_submission = params_serializer.validated_data[ + FacilityCreateQueryParams.PUBLIC] + private_allowed = flag_is_active( + request._request, FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY) + if not public_submission and not private_allowed: + raise PermissionDenied('Cannot submit a private facility') + + parse_started = str(datetime.utcnow()) + + source = Source.objects.create( + contributor=request.user.contributor, + source_type=Source.SINGLE, + is_public=public_submission, + create=should_create + ) + + country_code = get_country_code( + body_serializer.validated_data.get('country')) + name = body_serializer.validated_data.get('name') + address = body_serializer.validated_data.get('address') + + item = FacilityListItem.objects.create( + source=source, + row_index=0, + raw_data=request.data, + status=FacilityListItem.PARSED, + name=name, + address=address, + country_code=country_code, + processing_results=[{ + 'action': ProcessingAction.PARSE, + 'started_at': parse_started, + 'error': False, + 'finished_at': str(datetime.utcnow()), + 'is_geocoded': False, + }] + ) + + result = { + 'matches': [], + 'item_id': item.id, + 'geocoded_geometry': None, + 'geocoded_address': None, + 'status': item.status, + } + + geocode_started = str(datetime.utcnow()) + try: + geocode_result = geocode_address(address, country_code) + if geocode_result['result_count'] > 0: + item.status = FacilityListItem.GEOCODED + item.geocoded_point = Point( + geocode_result["geocoded_point"]["lng"], + geocode_result["geocoded_point"]["lat"] + ) + item.geocoded_address = geocode_result["geocoded_address"] + + result['geocoded_geometry'] = { + 'type': 'Point', + 'coordinates': [ + geocode_result["geocoded_point"]["lng"], + geocode_result["geocoded_point"]["lat"], + ] + } + result['geocoded_address'] = item.geocoded_address + else: + item.status = FacilityListItem.GEOCODED_NO_RESULTS + + item.processing_results.append({ + 'action': ProcessingAction.GEOCODE, + 'started_at': geocode_started, + 'error': False, + 'skipped_geocoder': False, + 'data': geocode_result['full_response'], + 'finished_at': str(datetime.utcnow()), + }) + + item.save() + except Exception as e: + item.status = FacilityListItem.ERROR_GEOCODING + item.processing_results.append({ + 'action': ProcessingAction.GEOCODE, + 'started_at': geocode_started, + 'error': True, + 'message': str(e), + 'trace': traceback.format_exc(), + 'finished_at': str(datetime.utcnow()), + }) + item.save() + result['status'] = item.status + return Response(result, + status=status.HTTP_500_INTERNAL_SERVER_ERROR) + + match_started = str(datetime.utcnow()) + try: + match_results = match_item(country_code, name, address, item.id) + save_match_details(match_results) + + automatic_threshold = \ + match_results['results']['automatic_threshold'] + + item_matches = match_results['item_matches'] + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for facility_id, score 
in matches: + facility = Facility.objects.get(id=facility_id) + facility_dict = FacilityDetailsSerializer(facility).data + # calling `round` alone was not trimming digits + facility_dict['confidence'] = float(str(round(score, 4))) + if score < automatic_threshold: + if should_create: + # TODO: Move match confirmation out of the + # FacilityListViewSet and set URLs here + facility_dict['confirm_match_url'] = None + facility_dict['reject_match_url'] = None + result['matches'].append(facility_dict) + except Exception as e: + item.status = FacilityListItem.ERROR_MATCHING + item.processing_results.append({ + 'action': ProcessingAction.MATCH, + 'started_at': match_started, + 'error': True, + 'message': str(e), + 'finished_at': str(datetime.utcnow()) + }) + item.save() + result['status'] = item.status + return Response(result, + status=status.HTTP_500_INTERNAL_SERVER_ERROR) + + item.refresh_from_db() + result['item_id'] = item.id + result['status'] = item.status + if item.facility is not None: + result['oar_id'] = item.facility.id + if item.facility.created_from == item: + result['status'] = FacilityListItem.NEW_FACILITY + elif (item.status == FacilityListItem.MATCHED + and len(item_matches.keys()) == 0): + # This branch handles the case where they client has specified + # `create=false` but there are no matches, which means that we + # would have attempted to create a new `Facility`. + if item.geocoded_point is None: + result['status'] = FacilityListItem.ERROR_MATCHING + else: + result['status'] = FacilityListItem.NEW_FACILITY + + if should_create and result['status'] != FacilityListItem.ERROR_MATCHING: # noqa + return Response(result, status=status.HTTP_201_CREATED) + else: + return Response(result, status=status.HTTP_200_OK) + @transaction.atomic def destroy(self, request, pk=None): if request.user.is_anonymous: @@ -641,17 +1013,21 @@ def destroy(self, request, pk=None): ) now = str(datetime.utcnow()) - list_item = facility.created_from - list_item.status = FacilityListItem.DELETED - list_item.processing_results.append({ - 'action': ProcessingAction.DELETE_FACILITY, - 'started_at': now, - 'error': False, - 'finished_at': now, - 'deleted_oar_id': facility.id, - }) - list_item.facility = None - list_item.save() + list_items = FacilityListItem \ + .objects \ + .filter(facility=facility) \ + .filter(Q(source__create=False) | Q(id=facility.created_from.id)) + for list_item in list_items: + list_item.status = FacilityListItem.DELETED + list_item.processing_results.append({ + 'action': ProcessingAction.DELETE_FACILITY, + 'started_at': now, + 'error': False, + 'finished_at': now, + 'deleted_oar_id': facility.id, + }) + list_item.facility = None + list_item.save() match = facility.get_created_from_match() match.changeReason = 'Deleted {}'.format(facility.id) @@ -1254,7 +1630,8 @@ def get_facility_history(self, request, pk=None): } ] """ - if not flag_is_active(request._request, 'can_get_facility_history'): + if not flag_is_active(request._request, + FeatureGroups.CAN_GET_FACILITY_HISTORY): raise PermissionDenied() historical_facility_queryset = Facility.history.filter(id=pk)
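For reviewers, a minimal client-side sketch of exercising the new endpoint. The base URL, token, and DRF-style token auth header are placeholders and assumptions, not something this PR establishes; the request body fields, the create/public query parameters, and the 200-vs-201 behavior come from the code and docstring above.

    import requests

    # Hypothetical host and credential; adjust the auth scheme to whatever
    # the deployment actually uses.
    BASE_URL = 'https://example.openapparel.org'
    HEADERS = {'Authorization': 'Token REPLACE_ME'}

    payload = {
        'country': 'United States',
        'name': 'Pants Hut',
        'address': '123 Main St, Anywhereville, PA',
    }

    # With create=false the endpoint returns match results without saving a
    # new facility or facility match, so a successful response is 200 rather
    # than 201.
    response = requests.post(
        BASE_URL + '/api/facilities/',
        params={'create': 'false', 'public': 'true'},
        json=payload,
        headers=HEADERS,
    )
    print(response.status_code, response.json().get('status'))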