Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
Identify exact matches pre-dedupe
Browse files Browse the repository at this point in the history
When matching facilities, look for exact matches prior to entering
the dedupe process. Exact matches (for name, address, and country)
can be automatically matched, and don't need to enter the dedupe
process.
  • Loading branch information
TaiWilkin committed Jan 5, 2022
1 parent 8087fba commit f567218
Show file tree
Hide file tree
Showing 6 changed files with 293 additions and 67 deletions.
15 changes: 10 additions & 5 deletions src/django/api/management/commands/batch_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

from api.constants import ProcessingAction
from api.models import FacilityList, FacilityListItem
from api.matching import match_facility_list_items
from api.matching import match_facility_list_items, identify_exact_matches
from api.processing import (parse_facility_list_item,
geocode_facility_list_item,
save_match_details)
save_match_details,
save_exact_match_details)
from api.mail import notify_facility_list_complete

LINE_ITEM_ACTIONS = {
Expand Down Expand Up @@ -69,13 +70,17 @@ def handle(self, *args, **options):
total_item_count = \
facility_list.source.facilitylistitem_set.count()

result = match_facility_list_items(facility_list)
success_count = len(result['processed_list_item_ids'])
fail_count = total_item_count - success_count
exact_result = identify_exact_matches(facility_list)
with transaction.atomic():
save_exact_match_details(exact_result)

result = match_facility_list_items(facility_list)
with transaction.atomic():
save_match_details(result)

success_count = len(result['processed_list_item_ids']) + \
len(exact_result['processed_list_item_ids'])
fail_count = total_item_count - success_count
if success_count > 0:
self.stdout.write(
self.style.SUCCESS(
Expand Down
68 changes: 67 additions & 1 deletion src/django/api/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.db import transaction
from django.db.models import Q, Max
from django.db.models import Q, Max, ExpressionWrapper, BooleanField
from unidecode import unidecode

from api.models import (Facility,
Expand Down Expand Up @@ -111,6 +111,72 @@ def get_canonical_items():
return items


def exact_match_items(messy, contributor):
    """
    Find previously matched list items that are identical to each messy
    item on cleaned name, cleaned address and country code.

    Arguments:
    messy -- A dict mapping an item ID to a dict with 'name', 'address' and
             'country' entries.
    contributor -- The contributor the messy items belong to. Candidates
                   whose source has this contributor are ranked ahead of
                   the others.

    Returns:
    A dict with the keys:
    processed_list_item_ids -- IDs of the messy items that have at least
                               one exact match
    item_matches -- A dict mapping each of those IDs to an ordered queryset
                    of {'facility_id': ...} rows, best candidate first
    started, finished -- str() of the UTC datetimes bracketing the lookup
    """
    started = str(datetime.utcnow())

    # Only list items that already resolved to a facility are candidates.
    matched_items = FacilityListItem.objects.filter(
        status__in=[FacilityListItem.MATCHED,
                    FacilityListItem.CONFIRMED_MATCH])
    # List items that have an active match on an active source. This lazy
    # queryset is passed to `id__in` below to rank the candidates.
    active_item_ids = FacilityMatch.objects \
        .filter(status__in=[FacilityMatch.AUTOMATIC,
                            FacilityMatch.CONFIRMED,
                            FacilityMatch.MERGED],
                is_active=True,
                facility_list_item__source__is_active=True) \
        .values_list('facility_list_item')

    results = {}

    for messy_id, item in messy.items():
        country = item.get('country')
        if country is None:
            # An item without a country cannot be an exact match on
            # name + address + country. Skip it instead of failing on
            # None.upper() so the remaining items are still processed.
            continue

        is_active = ExpressionWrapper(
            Q(id__in=active_item_ids),
            output_field=BooleanField())
        has_same_contributor = ExpressionWrapper(
            Q(source__contributor=contributor),
            output_field=BooleanField())

        # Rank: active matches first, then the same contributor, then the
        # least recently updated item.
        exact_matches = matched_items \
            .filter(clean_name=clean(item.get('name', '')),
                    clean_address=clean(item.get('address', '')),
                    country_code=country.upper()) \
            .annotate(is_active=is_active) \
            .annotate(has_same_contributor=has_same_contributor) \
            .order_by('-is_active',
                      '-has_same_contributor',
                      'updated_at') \
            .values('facility_id')

        # Truth-testing evaluates the queryset once and caches the rows for
        # whoever iterates it later.
        if exact_matches:
            results[messy_id] = exact_matches

    finished = str(datetime.utcnow())

    return {
        'processed_list_item_ids': list(results.keys()),
        'item_matches': results,
        'started': started,
        'finished': finished
    }


def identify_exact_matches(facility_list):
    """
    Run the exact-match lookup for every messy item of a facility list.

    The messy items come from get_messy_items_from_facility_list and the
    contributor is taken from the list's source. The dict returned by
    exact_match_items is passed through unchanged.
    """
    return exact_match_items(
        get_messy_items_from_facility_list(facility_list),
        facility_list.source.contributor)


def exact_match_item(country, name, address, contributor, id='id'):
    """
    Run the exact-match lookup for a single facility described by its
    country, name and address.

    Each value is passed through clean() and the three are wrapped in a
    one-entry dict keyed by str(id). That dict is handed to
    exact_match_items, whose result dict is returned unchanged.
    """
    key = str(id)
    cleaned_fields = {
        "country": clean(country),
        "name": clean(name),
        "address": clean(address),
    }
    return exact_match_items({key: cleaned_fields}, contributor)


def get_messy_items_from_facility_list(facility_list):
"""
Fetch all `FacilityListItem` objects that belong to the specified
Expand Down
41 changes: 41 additions & 0 deletions src/django/api/migrations/0075_add_fli_clean_name_and_address.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Generated by Django 2.2.24 on 2021-12-27 19:40

from django.db import migrations, models
from api.matching import clean


def populate_cleaned_fields(apps, schema_editor):
    """
    Backfill clean_name and clean_address on every existing
    FacilityListItem from its name and address.
    """
    FacilityListItem = apps.get_model('api', 'FacilityListItem')
    # Stream the rows instead of caching the whole table in memory.
    for list_item in FacilityListItem.objects.all().iterator():
        list_item.clean_name = clean(list_item.name)
        list_item.clean_address = clean(list_item.address)
        # Write only the two new columns. A bare save() rewrites every
        # field and bumps the auto_now `updated_at` timestamp, which the
        # exact-match ordering relies on.
        list_item.save(update_fields=['clean_name', 'clean_address'])


def do_nothing_on_reverse(apps, schema_editor):
    """Reverse step for the data backfill: intentionally a no-op."""
    return None


class Migration(migrations.Migration):
    """
    Add clean_name and clean_address to FacilityListItem, index them
    together with country_code, then backfill the existing rows.
    """

    dependencies = [('api', '0074_embedconfig_prefer_contributor_name')]

    operations = [
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_address',
            field=models.CharField(
                default='',
                help_text='The cleaned address of the facility.',
                max_length=200,
            ),
        ),
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_name',
            field=models.CharField(
                default='',
                help_text='The cleaned name of the facility.',
                max_length=200,
            ),
        ),
        migrations.AddIndex(
            model_name='facilitylistitem',
            index=models.Index(
                fields=['country_code', 'clean_name', 'clean_address'],
                name='api_fli_match_fields_idx',
            ),
        ),
        # The reverse step is a no-op.
        migrations.RunPython(
            code=populate_cleaned_fields,
            reverse_code=do_nothing_on_reverse,
        ),
    ]
15 changes: 15 additions & 0 deletions src/django/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,9 @@ class Meta:
indexes = [
models.Index(fields=['source', 'row_index'],
name='api_fli_facility_list_row_idx'),
models.Index(fields=['country_code', 'clean_name',
'clean_address'],
name='api_fli_match_fields_idx')
]

source = models.ForeignKey(
Expand Down Expand Up @@ -621,6 +624,18 @@ class Meta:
help_text=('The facility created from this list item or the '
'previously existing facility to which this list '
'item was matched.'))
clean_name = models.CharField(
max_length=200,
null=False,
blank=False,
default='',
help_text='The cleaned name of the facility.')
clean_address = models.CharField(
max_length=200,
null=False,
blank=False,
default='',
help_text='The cleaned address of the facility.')
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)

Expand Down
71 changes: 71 additions & 0 deletions src/django/api/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,10 @@ def parse_facility_list_item(item):
values[fields.index(CsvHeaderField.COUNTRY)])
if CsvHeaderField.NAME in fields:
item.name = values[fields.index(CsvHeaderField.NAME)]
item.clean_name = clean(item.name)
if CsvHeaderField.ADDRESS in fields:
item.address = values[fields.index(CsvHeaderField.ADDRESS)]
item.clean_address = clean(item.address)
if CsvHeaderField.LAT in fields and CsvHeaderField.LNG in fields:
lat = float(values[fields.index(CsvHeaderField.LAT)])
lng = float(values[fields.index(CsvHeaderField.LNG)])
Expand Down Expand Up @@ -492,3 +494,72 @@ def make_pending_match(item_id, facility_id, score):
update_extendedfields_for_list_item(item)

return all_matches


def save_exact_match_details(exact_results):
    """
    Save the results of a call to identify_exact_matches by building
    FacilityMatch instances and updating the state of the affected
    FacilityListItems.

    Should be called in a transaction to ensure that all the updates are
    applied atomically.

    Arguments:
    exact_results -- The dict return value from a call to
                     identify_exact_matches

    Returns:
    The list of `FacilityMatch` objects built for the items. They are only
    saved to the database when the item's source has `create` set.
    """
    processed_list_item_ids = exact_results['processed_list_item_ids']
    item_matches = exact_results['item_matches']
    started = exact_results['started']
    finished = exact_results['finished']

    def make_pending_match(item_id, facility_id):
        return FacilityMatch(
            facility_list_item_id=item_id,
            facility_id=facility_id,
            confidence=1.0,
            status=FacilityMatch.PENDING,
            results={})

    all_matches = []
    for item_id, exact_matches in item_matches.items():
        matches = [make_pending_match(item_id, m.get('facility_id'))
                   for m in exact_matches]
        if not matches:
            # No candidate to promote. Leave the item untouched instead of
            # failing on matches[0].
            continue

        # The candidates arrive ranked, so the first one becomes the
        # automatic match and the rest stay pending.
        best_match = matches[0]
        best_match.status = FacilityMatch.AUTOMATIC
        if len(matches) == 1:
            best_match.results['match_type'] = 'single_exact_match'
        else:
            best_match.results['match_type'] = 'multiple_exact_matches'

        item = FacilityListItem.objects.get(id=item_id)
        item.status = FacilityListItem.MATCHED
        item.facility_id = best_match.facility_id
        item.processing_results.append({
            'action': ProcessingAction.MATCH,
            'started_at': started,
            'error': False,
            'finished_at': finished,
            'exact_match': True
        })
        item.save()

        if item.source.create:
            for m in matches:
                m.save()
            # TODO: handle PPE if needed

        all_matches.extend(matches)

    # Refresh the extended fields of every processed item that now points
    # at a facility.
    items = FacilityListItem.objects.filter(id__in=processed_list_item_ids) \
        .exclude(facility__isnull=True)
    for item in items:
        update_extendedfields_for_list_item(item)

    return all_matches
Loading

0 comments on commit f567218

Please sign in to comment.