From f5672188ada97f71a9252f46a437efae90482724 Mon Sep 17 00:00:00 2001 From: Tai Wilkin-Corraggio Date: Wed, 29 Dec 2021 11:11:54 -0500 Subject: [PATCH] Identify exact matches pre-dedupe When matching facilities, look for exact matches prior to entering the dedupe process. Exact matches (for name, address, and country) can be automatically matched, and don't need to enter the dedupe process. --- .../api/management/commands/batch_process.py | 15 +- src/django/api/matching.py | 68 +++++++- .../0075_add_fli_clean_name_and_address.py | 41 +++++ src/django/api/models.py | 15 ++ src/django/api/processing.py | 71 +++++++++ src/django/api/views.py | 150 +++++++++++------- 6 files changed, 293 insertions(+), 67 deletions(-) create mode 100644 src/django/api/migrations/0075_add_fli_clean_name_and_address.py diff --git a/src/django/api/management/commands/batch_process.py b/src/django/api/management/commands/batch_process.py index ea59bb8de..0907cd923 100644 --- a/src/django/api/management/commands/batch_process.py +++ b/src/django/api/management/commands/batch_process.py @@ -6,10 +6,11 @@ from api.constants import ProcessingAction from api.models import FacilityList, FacilityListItem -from api.matching import match_facility_list_items +from api.matching import match_facility_list_items, identify_exact_matches from api.processing import (parse_facility_list_item, geocode_facility_list_item, - save_match_details) + save_match_details, + save_exact_match_details) from api.mail import notify_facility_list_complete LINE_ITEM_ACTIONS = { @@ -69,13 +70,17 @@ def handle(self, *args, **options): total_item_count = \ facility_list.source.facilitylistitem_set.count() - result = match_facility_list_items(facility_list) - success_count = len(result['processed_list_item_ids']) - fail_count = total_item_count - success_count + exact_result = identify_exact_matches(facility_list) + with transaction.atomic(): + save_exact_match_details(exact_result) + result = match_facility_list_items(facility_list) with transaction.atomic(): save_match_details(result) + success_count = len(result['processed_list_item_ids']) + \ + len(exact_result['processed_list_item_ids']) + fail_count = total_item_count - success_count if success_count > 0: self.stdout.write( self.style.SUCCESS( diff --git a/src/django/api/matching.py b/src/django/api/matching.py index 1e461b2d3..86cd4a32d 100644 --- a/src/django/api/matching.py +++ b/src/django/api/matching.py @@ -10,7 +10,7 @@ from django.conf import settings from django.contrib.postgres.search import TrigramSimilarity from django.db import transaction -from django.db.models import Q, Max +from django.db.models import Q, Max, ExpressionWrapper, BooleanField from unidecode import unidecode from api.models import (Facility, @@ -111,6 +111,72 @@ def get_canonical_items(): return items +def exact_match_items(messy, contributor): + started = str(datetime.utcnow()) + + matched_items = FacilityListItem.objects \ + .filter(status__in=[FacilityListItem.MATCHED, + FacilityListItem.CONFIRMED_MATCH]) + active_item_ids = FacilityMatch.objects \ + .filter(status__in=[FacilityMatch.AUTOMATIC, + FacilityMatch.CONFIRMED, + FacilityMatch.MERGED], + is_active=True, + facility_list_item__source__is_active=True) \ + .values_list('facility_list_item') + + results = dict() + + for messy_id, item in messy.items(): + clean_name = clean(item.get('name', '')) + clean_address = clean(item.get('address', '')) + country_code = item.get('country').upper() + exact_matches = matched_items.filter(clean_name=clean_name, + 
clean_address=clean_address, + country_code=country_code) \ + .annotate(is_active=ExpressionWrapper( + Q(id__in=active_item_ids), + output_field=BooleanField())) \ + .annotate(has_same_contributor=ExpressionWrapper( + Q(source__contributor=contributor), + output_field=BooleanField())) \ + .order_by('-is_active', + '-has_same_contributor', + 'updated_at') \ + .values('facility_id') + + if len(exact_matches) > 0: + results[messy_id] = exact_matches + + finished = str(datetime.utcnow()) + + return { + 'processed_list_item_ids': list(results.keys()), + 'item_matches': results, + 'started': started, + 'finished': finished + } + + +def identify_exact_matches(facility_list): + messy = get_messy_items_from_facility_list(facility_list) + contributor = facility_list.source.contributor + + return exact_match_items(messy, contributor) + + +def exact_match_item(country, name, address, contributor, id='id'): + return exact_match_items( + { + str(id): { + "country": clean(country), + "name": clean(name), + "address": clean(address) + } + }, + contributor) + + def get_messy_items_from_facility_list(facility_list): """ Fetch all `FacilityListItem` objects that belong to the specified diff --git a/src/django/api/migrations/0075_add_fli_clean_name_and_address.py b/src/django/api/migrations/0075_add_fli_clean_name_and_address.py new file mode 100644 index 000000000..5bcc110d3 --- /dev/null +++ b/src/django/api/migrations/0075_add_fli_clean_name_and_address.py @@ -0,0 +1,41 @@ +# Generated by Django 2.2.24 on 2021-12-27 19:40 + +from django.db import migrations, models +from api.matching import clean + + +def populate_cleaned_fields(apps, schema_editor): + FacilityListItem = apps.get_model('api', 'FacilityListItem') + for list_item in FacilityListItem.objects.all(): + list_item.clean_name = clean(list_item.name) + list_item.clean_address = clean(list_item.address) + list_item.save() + + +def do_nothing_on_reverse(apps, schema_editor): + pass + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0074_embedconfig_prefer_contributor_name'), + ] + + operations = [ + migrations.AddField( + model_name='facilitylistitem', + name='clean_address', + field=models.CharField(default='', help_text='The cleaned address of the facility.', max_length=200), + ), + migrations.AddField( + model_name='facilitylistitem', + name='clean_name', + field=models.CharField(default='', help_text='The cleaned name of the facility.', max_length=200), + ), + migrations.AddIndex( + model_name='facilitylistitem', + index=models.Index(fields=['country_code', 'clean_name', 'clean_address'], name='api_fli_match_fields_idx'), + ), + migrations.RunPython(populate_cleaned_fields, do_nothing_on_reverse) + ] diff --git a/src/django/api/models.py b/src/django/api/models.py index 39d6181cf..707ae696b 100644 --- a/src/django/api/models.py +++ b/src/django/api/models.py @@ -552,6 +552,9 @@ class Meta: indexes = [ models.Index(fields=['source', 'row_index'], name='api_fli_facility_list_row_idx'), + models.Index(fields=['country_code', 'clean_name', + 'clean_address'], + name='api_fli_match_fields_idx') ] source = models.ForeignKey( @@ -621,6 +624,18 @@ class Meta: help_text=('The facility created from this list item or the ' 'previously existing facility to which this list ' 'item was matched.')) + clean_name = models.CharField( + max_length=200, + null=False, + blank=False, + default='', + help_text='The cleaned name of the facility.') + clean_address = models.CharField( + max_length=200, + null=False, + blank=False, + default='', + 
help_text='The cleaned address of the facility.') created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/src/django/api/processing.py b/src/django/api/processing.py index c268464df..2c761548d 100644 --- a/src/django/api/processing.py +++ b/src/django/api/processing.py @@ -115,8 +115,10 @@ def parse_facility_list_item(item): values[fields.index(CsvHeaderField.COUNTRY)]) if CsvHeaderField.NAME in fields: item.name = values[fields.index(CsvHeaderField.NAME)] + item.clean_name = clean(item.name) if CsvHeaderField.ADDRESS in fields: item.address = values[fields.index(CsvHeaderField.ADDRESS)] + item.clean_address = clean(item.address) if CsvHeaderField.LAT in fields and CsvHeaderField.LNG in fields: lat = float(values[fields.index(CsvHeaderField.LAT)]) lng = float(values[fields.index(CsvHeaderField.LNG)]) @@ -492,3 +494,72 @@ def make_pending_match(item_id, facility_id, score): update_extendedfields_for_list_item(item) return all_matches + + +def save_exact_match_details(exact_results): + """ + Save the results of a call to identify_exact_matches by creating + Facility and FacilityMatch instances and updating the state of the affected + FacilityListItems. + + Should be called in a transaction to ensure that all the updates are + applied atomically. + + Arguments: + exact_results -- The dict return value from a call to + identify_exact_matches + + Returns: + The list of `FacilityMatch` objects created + """ + processed_list_item_ids = exact_results['processed_list_item_ids'] + item_matches = exact_results['item_matches'] + started = exact_results['started'] + finished = exact_results['finished'] + + def make_pending_match(item_id, facility_id): + return FacilityMatch( + facility_list_item_id=item_id, + facility_id=facility_id, + confidence=1.0, + status=FacilityMatch.PENDING, + results={}) + + all_matches = [] + for item_id, exact_matches in item_matches.items(): + item = FacilityListItem.objects.get(id=item_id) + item.status = FacilityListItem.POTENTIAL_MATCH + + matches = [make_pending_match(item_id, m.get('facility_id')) + for m in exact_matches] + matches[0].status = FacilityMatch.AUTOMATIC + item.status = FacilityListItem.MATCHED + item.facility_id = matches[0].facility_id + + if len(matches) == 1: + matches[0].results['match_type'] = 'single_exact_match' + else: + matches[0].results['match_type'] = 'multiple_exact_matches' + + item.processing_results.append({ + 'action': ProcessingAction.MATCH, + 'started_at': started, + 'error': False, + 'finished_at': finished, + 'exact_match': True + }) + item.save() + + if item.source.create: + for m in matches: + m.save() + # TODO: handle PPE if needed + + all_matches.extend(matches) + + items = FacilityListItem.objects.filter(id__in=processed_list_item_ids) \ + .exclude(facility__isnull=True) + for item in items: + update_extendedfields_for_list_item(item) + + return all_matches diff --git a/src/django/api/views.py b/src/django/api/views.py index f47f4ddbf..7d43aadcd 100644 --- a/src/django/api/views.py +++ b/src/django/api/views.py @@ -69,8 +69,10 @@ FeatureGroups) from api.geocoding import geocode_address from api.matching import (match_item, + exact_match_item, text_match_item, - GazetteerCacheTimeoutError) + GazetteerCacheTimeoutError, + clean) from api.models import (FacilityList, FacilityListItem, FacilityClaim, @@ -96,6 +98,7 @@ parse_excel, get_country_code, save_match_details, + save_exact_match_details, reduce_matches) from api.serializers import (FacilityListSerializer, 
FacilityListItemSerializer, @@ -1206,7 +1209,9 @@ def create(self, request): raw_data=json.dumps(request.data), status=FacilityListItem.PARSED, name=name, + clean_name=clean(name), address=address, + clean_address=clean(address), country_code=country_code, ppe_product_types=ppe_product_types, ppe_contact_phone=ppe_contact_phone, @@ -1279,72 +1284,95 @@ def create(self, request): status=status.HTTP_500_INTERNAL_SERVER_ERROR) match_started = str(datetime.utcnow()) + try: - match_results = match_item(country_code, name, address, item.id) - item_matches = match_results['item_matches'] - - gazetteer_match_count = len(item_matches.keys()) - - if gazetteer_match_count == 0 and text_only_fallback: - # When testing with more realistic data the text matching - # was returning dozens of results. Limiting to the first 5 is - # reasonable because the results are sorted with the highest - # confidence first. - text_only_matches = { - item.id: - list(text_match_item(item.country_code, item.name)[:5])} + exact_match_results = exact_match_item(country_code, name, address, + request.user.contributor, + item.id) + item_matches = exact_match_results['item_matches'] + exact_match_count = len(item_matches.keys()) + if exact_match_count > 0: + match_objects = save_exact_match_details(exact_match_results) + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for m in matches: + facility_id = m.get('facility_id') + facility = Facility.objects.get(id=facility_id) + context = {'request': request} + facility_dict = FacilityDetailsSerializer( + facility, context=context).data + result['matches'].append(facility_dict) else: - text_only_matches = {} - - match_objects = save_match_details( - match_results, text_only_matches=text_only_matches) - - automatic_threshold = \ - match_results['results']['automatic_threshold'] - - for item_id, matches in item_matches.items(): - result['item_id'] = item_id - result['status'] = item.status - for (facility_id, score), match in zip(reduce_matches(matches), - match_objects): - facility = Facility.objects.get(id=facility_id) - context = {'request': request} - facility_dict = FacilityDetailsSerializer( - facility, context=context).data - # calling `round` alone was not trimming digits - facility_dict['confidence'] = float(str(round(score, 4))) - # If there is a single match for an item, it only needs to - # be confirmed if it has a low score. - if score < automatic_threshold or len(match_objects) > 1: - if should_create: - facility_dict['confirm_match_url'] = reverse( - 'facility-match-confirm', - kwargs={'pk': match.pk}) - facility_dict['reject_match_url'] = reverse( - 'facility-match-reject', - kwargs={'pk': match.pk}) - result['matches'].append(facility_dict) - - # Append the text only match results to the response if there were - # no gazetteer matches - if gazetteer_match_count == 0: - for match in match_objects: - if match.results and match.results['text_only_match']: - item.status = FacilityListItem.POTENTIAL_MATCH + match_results = match_item(country_code, name, address, + item.id) + item_matches = match_results['item_matches'] + + gazetteer_match_count = len(item_matches.keys()) + + if gazetteer_match_count == 0 and text_only_fallback: + # When testing with more realistic data the text matching + # was returning dozens of results. Limiting to the first 5 + # is reasonable because the results are sorted with the + # highest confidence first. 
+ text_only_matches = { + item.id: + list(text_match_item(item.country_code, + item.name)[:5])} + else: + text_only_matches = {} + + match_objects = save_match_details( + match_results, text_only_matches=text_only_matches) + + automatic_threshold = \ + match_results['results']['automatic_threshold'] + + for item_id, matches in item_matches.items(): + result['item_id'] = item_id + result['status'] = item.status + for (facility_id, score), match in \ + zip(reduce_matches(matches), match_objects): + facility = Facility.objects.get(id=facility_id) context = {'request': request} facility_dict = FacilityDetailsSerializer( - match.facility, context=context).data - facility_dict['confidence'] = match.confidence - facility_dict['text_only_match'] = True - if should_create: - facility_dict['confirm_match_url'] = reverse( - 'facility-match-confirm', - kwargs={'pk': match.pk}) - facility_dict['reject_match_url'] = reverse( - 'facility-match-reject', - kwargs={'pk': match.pk}) + facility, context=context).data + # calling `round` alone was not trimming digits + facility_dict['confidence'] = float(str(round(score, + 4))) + # If there is a single match for an item, it only needs + # to be confirmed if it has a low score. + if score < automatic_threshold or \ + len(match_objects) > 1: + if should_create: + facility_dict['confirm_match_url'] = reverse( + 'facility-match-confirm', + kwargs={'pk': match.pk}) + facility_dict['reject_match_url'] = reverse( + 'facility-match-reject', + kwargs={'pk': match.pk}) result['matches'].append(facility_dict) + # Append the text only match results to the response if there + # were no gazetteer matches + if gazetteer_match_count == 0: + for match in match_objects: + if match.results and match.results['text_only_match']: + item.status = FacilityListItem.POTENTIAL_MATCH + context = {'request': request} + facility_dict = FacilityDetailsSerializer( + match.facility, context=context).data + facility_dict['confidence'] = match.confidence + facility_dict['text_only_match'] = True + if should_create: + facility_dict['confirm_match_url'] = reverse( + 'facility-match-confirm', + kwargs={'pk': match.pk}) + facility_dict['reject_match_url'] = reverse( + 'facility-match-reject', + kwargs={'pk': match.pk}) + result['matches'].append(facility_dict) + except GazetteerCacheTimeoutError as te: item.status = FacilityListItem.ERROR_MATCHING item.processing_results.append({
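
Note on usage (illustrative sketch, not part of the patch): the snippet below shows how the new exact-match pre-pass is intended to compose with the existing dedupe matching, mirroring the batch_process.py hunk above. The process_list helper is hypothetical; identify_exact_matches, save_exact_match_details, match_facility_list_items, and save_match_details are the functions added or reused in this patch.

    from django.db import transaction

    from api.matching import identify_exact_matches, match_facility_list_items
    from api.processing import save_exact_match_details, save_match_details


    def process_list(facility_list):
        # Pass 1: items whose cleaned name/address and country code exactly
        # match an existing matched item are matched without entering dedupe.
        exact_result = identify_exact_matches(facility_list)
        with transaction.atomic():
            save_exact_match_details(exact_result)

        # Pass 2: the remaining items go through the dedupe gazetteer as before.
        result = match_facility_list_items(facility_list)
        with transaction.atomic():
            save_match_details(result)

        # Total items matched by either pass.
        return (len(exact_result['processed_list_item_ids'])
                + len(result['processed_list_item_ids']))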