Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Identify exact matches pre-dedupe #1568

Merged
merged 1 commit into from
Jan 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### Changed

- Identify exact matches pre-dedupe [#1568](https://github.com/open-apparel-registry/open-apparel-registry/pull/1568)

### Deprecated

### Removed
Expand Down
15 changes: 10 additions & 5 deletions src/django/api/management/commands/batch_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

from api.constants import ProcessingAction
from api.models import FacilityList, FacilityListItem
from api.matching import match_facility_list_items
from api.matching import match_facility_list_items, identify_exact_matches
from api.processing import (parse_facility_list_item,
geocode_facility_list_item,
save_match_details)
save_match_details,
save_exact_match_details)
from api.mail import notify_facility_list_complete

LINE_ITEM_ACTIONS = {
Expand Down Expand Up @@ -69,13 +70,17 @@ def handle(self, *args, **options):
total_item_count = \
facility_list.source.facilitylistitem_set.count()

result = match_facility_list_items(facility_list)
success_count = len(result['processed_list_item_ids'])
fail_count = total_item_count - success_count
exact_result = identify_exact_matches(facility_list)
with transaction.atomic():
save_exact_match_details(exact_result)

result = match_facility_list_items(facility_list)
with transaction.atomic():
save_match_details(result)

success_count = len(result['processed_list_item_ids']) + \
len(exact_result['processed_list_item_ids'])
fail_count = total_item_count - success_count
if success_count > 0:
self.stdout.write(
self.style.SUCCESS(
Expand Down
68 changes: 67 additions & 1 deletion src/django/api/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.db import transaction
from django.db.models import Q, Max
from django.db.models import Q, Max, ExpressionWrapper, BooleanField
from unidecode import unidecode

from api.models import (Facility,
Expand Down Expand Up @@ -111,6 +111,72 @@ def get_canonical_items():
return items


def exact_match_items(messy, contributor):
    """
    Look up already-matched list items whose cleaned name, cleaned address
    and country code are identical to those of each messy item.

    Arguments:
    messy -- A dict keyed by item ID whose values are dicts with `country`,
             `name` and `address` entries.
    contributor -- The contributor used to rank candidates: list items whose
                   source belongs to this contributor sort ahead of others.

    Returns:
    A dict with the keys `processed_list_item_ids` (IDs of the messy items
    that had at least one exact match), `item_matches` (messy ID -> queryset
    of `{'facility_id': ...}` dicts ordered by descending `is_active`,
    descending `has_same_contributor`, then ascending `updated_at`),
    `started` and `finished` (UTC timestamps rendered as strings).
    """
    started = str(datetime.utcnow())

    # Only list items that already point at a facility are candidates.
    matched_items = FacilityListItem.objects \
        .filter(status__in=[FacilityListItem.MATCHED,
                            FacilityListItem.CONFIRMED_MATCH])
    # Left unevaluated so it is embedded as a subquery in the lookups below.
    active_item_ids = FacilityMatch.objects \
        .filter(status__in=[FacilityMatch.AUTOMATIC,
                            FacilityMatch.CONFIRMED,
                            FacilityMatch.MERGED],
                is_active=True,
                facility_list_item__source__is_active=True) \
        .values_list('facility_list_item')

    results = dict()

    for messy_id, item in messy.items():
        clean_name = clean(item.get('name', ''))
        clean_address = clean(item.get('address', ''))
        # A missing or null country must not abort the whole batch; an empty
        # code simply finds no exact match, mirroring the '' defaults above.
        country_code = (item.get('country') or '').upper()
        exact_matches = matched_items.filter(clean_name=clean_name,
                                             clean_address=clean_address,
                                             country_code=country_code) \
            .annotate(is_active=ExpressionWrapper(
                Q(id__in=active_item_ids),
                output_field=BooleanField())) \
            .annotate(has_same_contributor=ExpressionWrapper(
                Q(source__contributor=contributor),
                output_field=BooleanField())) \
            .order_by('-is_active',
                      '-has_same_contributor',
                      'updated_at') \
            .values('facility_id')

        # The truth test evaluates the queryset and caches its rows, so the
        # consumer can iterate the stored queryset without another query.
        if exact_matches:
            results[messy_id] = exact_matches

    finished = str(datetime.utcnow())

    return {
        'processed_list_item_ids': list(results.keys()),
        'item_matches': results,
        'started': started,
        'finished': finished
    }


def identify_exact_matches(facility_list):
    """
    Run exact matching for the messy items of a facility list.

    Arguments:
    facility_list -- The list whose items are fetched with
                     `get_messy_items_from_facility_list`; its source's
                     contributor is passed along for ranking.

    Returns:
    The dict returned by `exact_match_items`.
    """
    return exact_match_items(
        get_messy_items_from_facility_list(facility_list),
        facility_list.source.contributor)


def exact_match_item(country, name, address, contributor, id='id'):
    """
    Run exact matching for a single facility given as separate fields.

    Arguments:
    country -- The country value of the facility.
    name -- The name of the facility.
    address -- The address of the facility.
    contributor -- The contributor passed through to `exact_match_items`.
    id -- The key under which the item is submitted; it is converted with
          `str()` and defaults to 'id'.

    Returns:
    The dict returned by `exact_match_items` for the one-item input.
    """
    single_item = {
        'country': clean(country),
        'name': clean(name),
        'address': clean(address),
    }
    return exact_match_items({str(id): single_item}, contributor)


def get_messy_items_from_facility_list(facility_list):
"""
Fetch all `FacilityListItem` objects that belong to the specified
Expand Down
41 changes: 41 additions & 0 deletions src/django/api/migrations/0075_add_fli_clean_name_and_address.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Generated by Django 2.2.24 on 2021-12-27 19:40

from django.db import migrations, models
from api.matching import clean


def populate_cleaned_fields(apps, schema_editor):
    """
    Backfill `clean_name` and `clean_address` on every existing
    FacilityListItem from its `name` and `address`.

    Arguments:
    apps -- The historical app registry supplied by the migration runner.
    schema_editor -- Unused; required by the RunPython callable signature.
    """
    FacilityListItem = apps.get_model('api', 'FacilityListItem')
    # iterator() streams the rows instead of caching the entire table in the
    # queryset, which keeps memory flat on large installations.
    for list_item in FacilityListItem.objects.all().iterator():
        list_item.clean_name = clean(list_item.name)
        list_item.clean_address = clean(list_item.address)
        # Write only the two backfilled columns. A full save() would rewrite
        # every column and refresh the auto_now `updated_at` timestamp, which
        # exact matching uses as an ordering key.
        list_item.save(update_fields=['clean_name', 'clean_address'])


def do_nothing_on_reverse(apps, schema_editor):
    """
    Reverse callable for the data backfill; intentionally performs no work
    so that the migration can be unapplied.
    """
    return None


class Migration(migrations.Migration):
    """
    Add the `clean_name` and `clean_address` columns to FacilityListItem,
    index them together with `country_code`, and backfill existing rows.
    """

    dependencies = [('api', '0074_embedconfig_prefer_contributor_name')]

    operations = [
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_address',
            field=models.CharField(
                default='',
                help_text='The cleaned address of the facility.',
                max_length=200),
        ),
        migrations.AddField(
            model_name='facilitylistitem',
            name='clean_name',
            field=models.CharField(
                default='',
                help_text='The cleaned name of the facility.',
                max_length=200),
        ),
        migrations.AddIndex(
            model_name='facilitylistitem',
            index=models.Index(
                fields=['country_code', 'clean_name', 'clean_address'],
                name='api_fli_match_fields_idx'),
        ),
        # Data step: fill the new columns for rows that already exist.
        migrations.RunPython(populate_cleaned_fields, do_nothing_on_reverse),
    ]
15 changes: 15 additions & 0 deletions src/django/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,9 @@ class Meta:
indexes = [
models.Index(fields=['source', 'row_index'],
name='api_fli_facility_list_row_idx'),
models.Index(fields=['country_code', 'clean_name',
'clean_address'],
name='api_fli_match_fields_idx')
]

source = models.ForeignKey(
Expand Down Expand Up @@ -621,6 +624,18 @@ class Meta:
help_text=('The facility created from this list item or the '
'previously existing facility to which this list '
'item was matched.'))
clean_name = models.CharField(
max_length=200,
null=False,
blank=False,
default='',
help_text='The cleaned name of the facility.')
clean_address = models.CharField(
max_length=200,
null=False,
blank=False,
default='',
help_text='The cleaned address of the facility.')
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)

Expand Down
71 changes: 71 additions & 0 deletions src/django/api/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,10 @@ def parse_facility_list_item(item):
values[fields.index(CsvHeaderField.COUNTRY)])
if CsvHeaderField.NAME in fields:
item.name = values[fields.index(CsvHeaderField.NAME)]
item.clean_name = clean(item.name)
if CsvHeaderField.ADDRESS in fields:
item.address = values[fields.index(CsvHeaderField.ADDRESS)]
item.clean_address = clean(item.address)
if CsvHeaderField.LAT in fields and CsvHeaderField.LNG in fields:
lat = float(values[fields.index(CsvHeaderField.LAT)])
lng = float(values[fields.index(CsvHeaderField.LNG)])
Expand Down Expand Up @@ -492,3 +494,72 @@ def make_pending_match(item_id, facility_id, score):
update_extendedfields_for_list_item(item)

return all_matches


def save_exact_match_details(exact_results):
    """
    Save the results of a call to identify_exact_matches by creating
    FacilityMatch instances and updating the state of the affected
    FacilityListItems.

    Should be called in a transaction to ensure that all the updates are
    applied atomically.

    Arguments:
    exact_results -- The dict return value from a call to
                     identify_exact_matches

    Returns:
    The list of `FacilityMatch` objects built for the processed items. The
    objects are only saved when the item's source has `create` set.
    """
    processed_list_item_ids = exact_results['processed_list_item_ids']
    item_matches = exact_results['item_matches']
    started = exact_results['started']
    finished = exact_results['finished']

    def make_pending_match(item_id, facility_id):
        return FacilityMatch(
            facility_list_item_id=item_id,
            facility_id=facility_id,
            confidence=1.0,
            status=FacilityMatch.PENDING,
            results={})

    all_matches = []
    for item_id, exact_matches in item_matches.items():
        matches = [make_pending_match(item_id, m.get('facility_id'))
                   for m in exact_matches]
        if not matches:
            # Nothing to record for this item; leave it untouched rather
            # than failing on matches[0] below.
            continue

        item = FacilityListItem.objects.get(id=item_id)

        # NOTE(review): when several list items match exactly, each one gets
        # its own FacilityMatch: the first (best ranked) becomes AUTOMATIC
        # and the rest stay PENDING, so the same facility can appear more
        # than once. This was observed during PR review and accepted as
        # consistent with the regular dedupe flow.
        matches[0].status = FacilityMatch.AUTOMATIC
        item.status = FacilityListItem.MATCHED
        item.facility_id = matches[0].facility_id

        if len(matches) == 1:
            matches[0].results['match_type'] = 'single_exact_match'
        else:
            matches[0].results['match_type'] = 'multiple_exact_matches'

        item.processing_results.append({
            'action': ProcessingAction.MATCH,
            'started_at': started,
            'error': False,
            'finished_at': finished,
            'exact_match': True
        })
        item.save()

        if item.source.create:
            for m in matches:
                m.save()
                # TODO: handle PPE if needed

        all_matches.extend(matches)

    items = FacilityListItem.objects.filter(id__in=processed_list_item_ids) \
        .exclude(facility__isnull=True)
    for item in items:
        update_extendedfields_for_list_item(item)

    return all_matches