diff --git a/src/django/api/management/commands/batch_process.py b/src/django/api/management/commands/batch_process.py index ea59bb8de..0907cd923 100644 --- a/src/django/api/management/commands/batch_process.py +++ b/src/django/api/management/commands/batch_process.py @@ -6,10 +6,11 @@ from api.constants import ProcessingAction from api.models import FacilityList, FacilityListItem -from api.matching import match_facility_list_items +from api.matching import match_facility_list_items, identify_exact_matches from api.processing import (parse_facility_list_item, geocode_facility_list_item, - save_match_details) + save_match_details, + save_exact_match_details) from api.mail import notify_facility_list_complete LINE_ITEM_ACTIONS = { @@ -69,13 +70,17 @@ def handle(self, *args, **options): total_item_count = \ facility_list.source.facilitylistitem_set.count() - result = match_facility_list_items(facility_list) - success_count = len(result['processed_list_item_ids']) - fail_count = total_item_count - success_count + exact_result = identify_exact_matches(facility_list) + with transaction.atomic(): + save_exact_match_details(exact_result) + result = match_facility_list_items(facility_list) with transaction.atomic(): save_match_details(result) + success_count = len(result['processed_list_item_ids']) + \ + len(exact_result['processed_list_item_ids']) + fail_count = total_item_count - success_count if success_count > 0: self.stdout.write( self.style.SUCCESS( diff --git a/src/django/api/matching.py b/src/django/api/matching.py index 1e461b2d3..86cd4a32d 100644 --- a/src/django/api/matching.py +++ b/src/django/api/matching.py @@ -10,7 +10,7 @@ from django.conf import settings from django.contrib.postgres.search import TrigramSimilarity from django.db import transaction -from django.db.models import Q, Max +from django.db.models import Q, Max, ExpressionWrapper, BooleanField from unidecode import unidecode from api.models import (Facility, @@ -111,6 +111,72 @@ def 
def exact_match_items(messy, contributor):
    """
    Find previously matched list items whose cleaned name, cleaned address
    and country code are identical to those of each messy item.

    Candidate rows are ordered so that the preferred facility comes first:
    rows with an active match on an active source, then rows submitted by the
    same contributor, then by ascending `updated_at`.

    Arguments:
    messy -- A dict mapping an item ID to a dict with `country`, `name` and
             `address` keys.
    contributor -- The contributor submitting the messy items.

    Returns:
    A dict with the keys
    processed_list_item_ids -- The IDs of the messy items that had at least
                               one exact match.
    item_matches -- A dict mapping each of those IDs to a non-empty list of
                    `{'facility_id': ...}` dicts in preference order.
    started -- String timestamp taken before matching began.
    finished -- String timestamp taken after matching ended.
    """
    started = str(datetime.utcnow())

    matched_items = FacilityListItem.objects \
        .filter(status__in=[FacilityListItem.MATCHED,
                            FacilityListItem.CONFIRMED_MATCH])
    active_item_ids = FacilityMatch.objects \
        .filter(status__in=[FacilityMatch.AUTOMATIC,
                            FacilityMatch.CONFIRMED,
                            FacilityMatch.MERGED],
                is_active=True,
                facility_list_item__source__is_active=True) \
        .values_list('facility_list_item')

    results = {}

    for messy_id, item in messy.items():
        # `or ''` keeps a missing key and an explicit None equivalent so that
        # neither reaches `clean` or `.upper()` as None.
        clean_name = clean(item.get('name') or '')
        clean_address = clean(item.get('address') or '')
        country_code = (item.get('country') or '').upper()

        # The clean_* columns default to '' so a blank value would "exactly
        # match" every other row that also has blank cleaned fields. Blank
        # input can never be a meaningful exact match; leave such items for
        # the regular matching process.
        if not (clean_name and clean_address and country_code):
            continue

        # NOTE(review): several matched list items can point at the same
        # facility, so the same facility_id may appear more than once in the
        # result. Confirm whether callers expect de-duplicated IDs.
        exact_matches = list(
            matched_items.filter(clean_name=clean_name,
                                 clean_address=clean_address,
                                 country_code=country_code)
            .annotate(is_active=ExpressionWrapper(
                Q(id__in=active_item_ids),
                output_field=BooleanField()))
            .annotate(has_same_contributor=ExpressionWrapper(
                Q(source__contributor=contributor),
                output_field=BooleanField()))
            .order_by('-is_active',
                      '-has_same_contributor',
                      'updated_at')
            .values('facility_id'))

        # Materialised above so the query runs exactly once and the returned
        # structure holds plain lists rather than lazy QuerySets.
        if exact_matches:
            results[messy_id] = exact_matches

    finished = str(datetime.utcnow())

    return {
        'processed_list_item_ids': list(results.keys()),
        'item_matches': results,
        'started': started,
        'finished': finished
    }


def identify_exact_matches(facility_list):
    """
    Run exact matching for the messy items of a facility list.

    Arguments:
    facility_list -- The `FacilityList` whose items should be matched. Its
                     source's contributor is used to rank candidates.

    Returns:
    The dict returned by `exact_match_items`.
    """
    messy = get_messy_items_from_facility_list(facility_list)
    contributor = facility_list.source.contributor

    return exact_match_items(messy, contributor)


def exact_match_item(country, name, address, contributor, id='id'):
    """
    Run exact matching for a single facility described by its raw fields.

    Arguments:
    country -- The country value of the facility.
    name -- The name of the facility.
    address -- The address of the facility.
    contributor -- The contributor submitting the facility.
    id -- The key under which the facility is reported in the result. It is
          converted to a string.

    Returns:
    The dict returned by `exact_match_items` for a one-item input.
    """
    return exact_match_items(
        {
            str(id): {
                "country": clean(country),
                "name": clean(name),
                "address": clean(address)
            }
        },
        contributor)
# Generated by Django 2.2.24 on 2021-12-27 19:40

from django.db import migrations, models
from api.matching import clean


def populate_cleaned_fields(apps, schema_editor):
    """
    Backfill `clean_name` and `clean_address` for every existing
    FacilityListItem from its `name` and `address`.

    Rows are streamed and written in batches, and only the two new columns
    are updated, so the whole table is never held in memory and no other
    column (including the `auto_now` `updated_at`) is rewritten.
    """
    FacilityListItem = apps.get_model('api', 'FacilityListItem')
    target_fields = ['clean_name', 'clean_address']
    batch_size = 1000

    pending = []
    rows = FacilityListItem.objects.only('name', 'address').iterator()
    for list_item in rows:
        # NOTE(review): `clean` lives in application code, so this migration
        # depends on its current behaviour. The new columns are NOT NULL with
        # a '' default, so any falsy result is stored as ''.
        list_item.clean_name = clean(list_item.name) or ''
        list_item.clean_address = clean(list_item.address) or ''
        pending.append(list_item)

        if len(pending) >= batch_size:
            FacilityListItem.objects.bulk_update(pending, target_fields)
            pending = []

    if pending:
        FacilityListItem.objects.bulk_update(pending, target_fields)


def do_nothing_on_reverse(apps, schema_editor):
    """Reverse no-op: removing the columns discards the backfilled data."""
    pass


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0074_embedconfig_prefer_contributor_name'),
    ]

    # Schema changes come first; the data backfill runs last.
    operations = [
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_address',
            field=models.CharField(default='', help_text='The cleaned address of the facility.', max_length=200),
        ),
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_name',
            field=models.CharField(default='', help_text='The cleaned name of the facility.', max_length=200),
        ),
        migrations.AddIndex(
            model_name='facilitylistitem',
            index=models.Index(fields=['country_code', 'clean_name', 'clean_address'], name='api_fli_match_fields_idx'),
        ),
        migrations.RunPython(populate_cleaned_fields, do_nothing_on_reverse)
    ]
def save_exact_match_details(exact_results):
    """
    Save the results of a call to identify_exact_matches by creating
    FacilityMatch instances and updating the state of the affected
    FacilityListItems.

    For each item the first exact match becomes an AUTOMATIC match and the
    item is marked MATCHED against that facility; any further exact matches
    are recorded as PENDING. FacilityMatch rows are only written when the
    item's source has `create` set; the item itself is always saved.

    Should be called in a transaction to ensure that all the updates are
    applied atomically.

    Arguments:
    exact_results -- The dict return value from a call to
                     identify_exact_matches

    Returns:
    The list of `FacilityMatch` objects built for the processed items. They
    are unsaved for items whose source does not have `create` set.
    """
    processed_list_item_ids = exact_results['processed_list_item_ids']
    item_matches = exact_results['item_matches']
    started = exact_results['started']
    finished = exact_results['finished']

    def make_pending_match(item_id, facility_id):
        return FacilityMatch(
            facility_list_item_id=item_id,
            facility_id=facility_id,
            confidence=1.0,
            status=FacilityMatch.PENDING,
            results={})

    all_matches = []
    for item_id, exact_matches in item_matches.items():
        matches = [make_pending_match(item_id, m.get('facility_id'))
                   for m in exact_matches]
        if not matches:
            # Nothing to record for this item; indexing matches[0] below
            # would otherwise raise IndexError.
            continue

        best_match = matches[0]
        best_match.status = FacilityMatch.AUTOMATIC
        best_match.results['match_type'] = (
            'single_exact_match' if len(matches) == 1
            else 'multiple_exact_matches')

        item = FacilityListItem.objects.get(id=item_id)
        item.status = FacilityListItem.MATCHED
        item.facility_id = best_match.facility_id
        item.processing_results.append({
            'action': ProcessingAction.MATCH,
            'started_at': started,
            'error': False,
            'finished_at': finished,
            'exact_match': True
        })
        item.save()

        if item.source.create:
            for m in matches:
                m.save()
                # TODO: handle PPE if needed

        all_matches.extend(matches)

    # Re-query so only items that ended up attached to a facility have their
    # extended fields refreshed.
    items = FacilityListItem.objects.filter(id__in=processed_list_item_ids) \
        .exclude(facility__isnull=True)
    for item in items:
        update_extendedfields_for_list_item(item)

    return all_matches
import (FacilityList, FacilityListItem, FacilityClaim, @@ -96,6 +98,7 @@ parse_excel, get_country_code, save_match_details, + save_exact_match_details, reduce_matches) from api.serializers import (FacilityListSerializer, FacilityListItemSerializer, @@ -1206,7 +1209,9 @@ def create(self, request): raw_data=json.dumps(request.data), status=FacilityListItem.PARSED, name=name, + clean_name=clean(name), address=address, + clean_address=clean(address), country_code=country_code, ppe_product_types=ppe_product_types, ppe_contact_phone=ppe_contact_phone, @@ -1279,72 +1284,95 @@ def create(self, request): status=status.HTTP_500_INTERNAL_SERVER_ERROR) match_started = str(datetime.utcnow()) + try: - match_results = match_item(country_code, name, address, item.id) - item_matches = match_results['item_matches'] - - gazetteer_match_count = len(item_matches.keys()) - - if gazetteer_match_count == 0 and text_only_fallback: - # When testing with more realistic data the text matching - # was returning dozens of results. Limiting to the first 5 is - # reasonable because the results are sorted with the highest - # confidence first. 
- text_only_matches = { - item.id: - list(text_match_item(item.country_code, item.name)[:5])} + exact_match_results = exact_match_item(country_code, name, address, + request.user.contributor, + item.id) + item_matches = exact_match_results['item_matches'] + exact_match_count = len(item_matches.keys()) + if exact_match_count > 0: + match_objects = save_exact_match_details(exact_match_results) + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for m in matches: + facility_id = m.get('facility_id') + facility = Facility.objects.get(id=facility_id) + context = {'request': request} + facility_dict = FacilityDetailsSerializer( + facility, context=context).data + result['matches'].append(facility_dict) else: - text_only_matches = {} - - match_objects = save_match_details( - match_results, text_only_matches=text_only_matches) - - automatic_threshold = \ - match_results['results']['automatic_threshold'] - - for item_id, matches in item_matches.items(): - result['item_id'] = item_id - result['status'] = item.status - for (facility_id, score), match in zip(reduce_matches(matches), - match_objects): - facility = Facility.objects.get(id=facility_id) - context = {'request': request} - facility_dict = FacilityDetailsSerializer( - facility, context=context).data - # calling `round` alone was not trimming digits - facility_dict['confidence'] = float(str(round(score, 4))) - # If there is a single match for an item, it only needs to - # be confirmed if it has a low score. 
- if score < automatic_threshold or len(match_objects) > 1: - if should_create: - facility_dict['confirm_match_url'] = reverse( - 'facility-match-confirm', - kwargs={'pk': match.pk}) - facility_dict['reject_match_url'] = reverse( - 'facility-match-reject', - kwargs={'pk': match.pk}) - result['matches'].append(facility_dict) - - # Append the text only match results to the response if there were - # no gazetteer matches - if gazetteer_match_count == 0: - for match in match_objects: - if match.results and match.results['text_only_match']: - item.status = FacilityListItem.POTENTIAL_MATCH + match_results = match_item(country_code, name, address, + item.id) + item_matches = match_results['item_matches'] + + gazetteer_match_count = len(item_matches.keys()) + + if gazetteer_match_count == 0 and text_only_fallback: + # When testing with more realistic data the text matching + # was returning dozens of results. Limiting to the first 5 + # is reasonable because the results are sorted with the + # highest confidence first. 
+ text_only_matches = { + item.id: + list(text_match_item(item.country_code, + item.name)[:5])} + else: + text_only_matches = {} + + match_objects = save_match_details( + match_results, text_only_matches=text_only_matches) + + automatic_threshold = \ + match_results['results']['automatic_threshold'] + + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for (facility_id, score), match in \ + zip(reduce_matches(matches), match_objects): + facility = Facility.objects.get(id=facility_id) context = {'request': request} facility_dict = FacilityDetailsSerializer( - match.facility, context=context).data - facility_dict['confidence'] = match.confidence - facility_dict['text_only_match'] = True - if should_create: - facility_dict['confirm_match_url'] = reverse( - 'facility-match-confirm', - kwargs={'pk': match.pk}) - facility_dict['reject_match_url'] = reverse( - 'facility-match-reject', - kwargs={'pk': match.pk}) + facility, context=context).data + # calling `round` alone was not trimming digits + facility_dict['confidence'] = float(str(round(score, + 4))) + # If there is a single match for an item, it only needs + # to be confirmed if it has a low score. 
+ if score < automatic_threshold or \ + len(match_objects) > 1: + if should_create: + facility_dict['confirm_match_url'] = reverse( + 'facility-match-confirm', + kwargs={'pk': match.pk}) + facility_dict['reject_match_url'] = reverse( + 'facility-match-reject', + kwargs={'pk': match.pk}) result['matches'].append(facility_dict) + # Append the text only match results to the response if there + # were no gazetteer matches + if gazetteer_match_count == 0: + for match in match_objects: + if match.results and match.results['text_only_match']: + item.status = FacilityListItem.POTENTIAL_MATCH + context = {'request': request} + facility_dict = FacilityDetailsSerializer( + match.facility, context=context).data + facility_dict['confidence'] = match.confidence + facility_dict['text_only_match'] = True + if should_create: + facility_dict['confirm_match_url'] = reverse( + 'facility-match-confirm', + kwargs={'pk': match.pk}) + facility_dict['reject_match_url'] = reverse( + 'facility-match-reject', + kwargs={'pk': match.pk}) + result['matches'].append(facility_dict) + except GazetteerCacheTimeoutError as te: item.status = FacilityListItem.ERROR_MATCHING item.processing_results.append({