Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
Merge pull request #1099 from open-apparel-registry/feature/jcw/exten…
Browse files Browse the repository at this point in the history
…ded-matching
  • Loading branch information
jwalgran committed Sep 9, 2020
2 parents 96af587 + d7dc120 commit 22696df
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### Added

- Add trigram match on facility name fallback when matching via API [#1099](https://github.com/open-apparel-registry/open-apparel-registry/pull/1099)

### Changed

### Deprecated
Expand Down
1 change: 1 addition & 0 deletions src/django/api/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class FacilityMergeQueryParams:
class FacilityCreateQueryParams:
    """
    Names of the query string parameters read when a facility is created.

    Each attribute holds the literal parameter name, so callers can index
    validated request data without repeating the raw strings.
    """

    # Query parameter named 'create'.
    CREATE = 'create'
    # Query parameter named 'public'.
    PUBLIC = 'public'
    # Query parameter named 'textonlyfallback'.
    TEXT_ONLY_FALLBACK = 'textonlyfallback'


class FeatureGroups:
Expand Down
23 changes: 23 additions & 0 deletions src/django/api/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from collections import defaultdict
from datetime import datetime
from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.db import transaction
from django.db.models import Q, Max
from unidecode import unidecode
Expand Down Expand Up @@ -391,6 +392,28 @@ def match_item(country,
recall_weight=recall_weight)


def text_match_item(country_code, name, threshold=0.5):
    """
    Find potential facility matches with simple fuzzy text matching on the
    facility name instead of a dedupe model.

    Arguments:
    country_code -- A valid 2-character ISO code. Only facilities with exactly
                    this country code are considered.
    name -- The name of the facility to compare against.
    threshold -- Value between 0.0 and 1.0. The minimum acceptable similarity
                 score. Defaults to 0.5.

    Returns:
    A Facility QuerySet containing items with a matching country code and a
    name similar to the name argument, ordered from most to least similar.
    Each item carries its score in a `similarity` annotation.
    """
    # TrigramSimilarity is evaluated by the database (PostgreSQL pg_trgm), so
    # the score is attached as an annotation that can be filtered and sorted.
    scored_facilities = Facility.objects.annotate(
        similarity=TrigramSimilarity('name', name))
    similar_in_country = scored_facilities.filter(
        similarity__gte=threshold,
        country_code=country_code)
    # Highest similarity first so callers can slice off the best candidates.
    return similar_in_country.order_by('-similarity')


def facility_values_to_dedupe_record(facility_dict):
"""
Convert a dictionary with id, country, name, and address keys into a
Expand Down
15 changes: 15 additions & 0 deletions src/django/api/migrations/0047_install_trigram_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by Django 2.2.11 on 2020-09-08 18:40

from django.db import migrations
from django.contrib.postgres.operations import TrigramExtension


class Migration(migrations.Migration):
    """
    Install the PostgreSQL trigram extension via Django's TrigramExtension
    operation so trigram similarity lookups are available to the `api` app.
    """

    # Runs after the previous `api` migration.
    dependencies = [('api', '0046_expand_ppe_product_types_and_phone')]

    operations = [TrigramExtension()]
38 changes: 36 additions & 2 deletions src/django/api/processing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import csv
import traceback
import sys
Expand Down Expand Up @@ -291,7 +292,7 @@ def is_string_match(item, facility):
and clean(item.address) == clean(facility.address))


def save_match_details(match_results):
def save_match_details(match_results, text_only_matches=None):
"""
Save the results of a call to match_facility_list_items by creating
Facility and FacilityMatch instances and updating the state of the affected
Expand All @@ -303,10 +304,16 @@ def save_match_details(match_results):
Arguments:
match_results -- The dict return value from a call to
match_facility_list_items.
text_only_matches -- An optional object where keys are FacilityListItem
IDs and values are a list of facilities that were
matched to the item without dedupe which should be
saved as pending matches.
Returns:
The list of `FacilityMatch` objects created
"""
if text_only_matches is None:
text_only_matches = {}
processed_list_item_ids = match_results['processed_list_item_ids']
item_matches = match_results['item_matches']
results = match_results['results']
Expand Down Expand Up @@ -410,7 +417,34 @@ def make_pending_match(item_id, facility_id, score):
.filter(id__in=processed_list_item_ids)
.exclude(id__in=item_matches.keys()))
for item in unmatched:
if item.status == FacilityListItem.GEOCODED_NO_RESULTS:
has_text_only_matches = (
item.id in text_only_matches
and len(text_only_matches[item.id]) > 0)
if has_text_only_matches:
text_only_results = copy.deepcopy(results)
text_only_results['text_only_match'] = True
text_only_match_objects = [
FacilityMatch(
facility_list_item_id=item.id,
facility_id=facility.id,
confidence=0,
status=FacilityMatch.PENDING,
results=text_only_results)
for facility in text_only_matches[item.id]]
if item.source.create:
for m in text_only_match_objects:
m.save()
all_matches.extend(text_only_match_objects)

item.status = FacilityListItem.POTENTIAL_MATCH
item.processing_results.append({
'action': ProcessingAction.MATCH,
'started_at': started,
'error': False,
'text_only_match': True,
'finished_at': finished
})
elif item.status == FacilityListItem.GEOCODED_NO_RESULTS:
item.status = FacilityListItem.ERROR_MATCHING
item.processing_results.append({
'action': ProcessingAction.MATCH,
Expand Down
1 change: 1 addition & 0 deletions src/django/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,7 @@ def validate_country(self, value):
class FacilityCreateQueryParamsSerializer(Serializer):
    """
    Validate the optional boolean query parameters for facility creation.

    All three fields are optional. `create` and `public` default to True;
    `textonlyfallback` defaults to False.
    """

    create = BooleanField(required=False, default=True)
    public = BooleanField(required=False, default=True)
    textonlyfallback = BooleanField(required=False, default=False)


class FacilityClaimSerializer(ModelSerializer):
Expand Down
106 changes: 103 additions & 3 deletions src/django/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@
UpdateLocationParams,
FeatureGroups)
from api.geocoding import geocode_address
from api.matching import match_item, GazetteerCacheTimeoutError
from api.matching import (match_item,
text_match_item,
GazetteerCacheTimeoutError)
from api.models import (FacilityList,
FacilityListItem,
FacilityClaim,
Expand Down Expand Up @@ -590,6 +592,17 @@ def get_schema_fields(self, view):
'created, the contributor will not be publicly '
'associated with the facility'),
),
coreapi.Field(
name='textonlyfallback',
location='query',
type='boolean',
required=False,
description=(
'If true and no confident matches were made then '
'attempt to make a text-only match of the facility '
'name. If more than 5 text matches are made only the '
'5 highest confidence results are returned'),
),
]

return []
Expand Down Expand Up @@ -917,6 +930,55 @@ def create(self, request):
}
### Potential Text Only Match
{
"matches": [
{
"id": "CN2019303BQ3FZP",
"type": "Feature",
"geometry": {
"type": "Point",
"coordinates": [
120.596047,
32.172013
]
},
"properties": {
"name": "Nantong Jackbeanie Headwear Garment Co. Ltd.",
"address": "No. 808, The Third Industry Park, Guoyuan Town, Rugao City Nantong",
"country_code": "CN",
"oar_id": "CN2019303BQ3FZP",
"other_names": [],
"other_addresses": [],
"contributors": [
{
"id": 4,
"name": "Researcher A (Summer 2019 Affiliate List)",
"is_verified": false
}
],
"country_name": "China",
"claim_info": null,
"other_locations": []
},
"confidence": 0,
"text_only_match": true
}
],
"item_id": 959,
"geocoded_geometry": {
"type": "Point",
"coordinates": [
120.596047,
32.172013
]
},
"geocoded_address": "Guoyuanzhen, Rugao, Nantong, Jiangsu, China",
"status": "POTENTIAL_MATCH"
}
### New Facility
{
Expand Down Expand Up @@ -965,6 +1027,8 @@ def create(self, request):
request._request, FeatureGroups.CAN_SUBMIT_PRIVATE_FACILITY)
if not public_submission and not private_allowed:
raise PermissionDenied('Cannot submit a private facility')
text_only_fallback = params_serializer.validated_data[
FacilityCreateQueryParams.TEXT_ONLY_FALLBACK]

parse_started = str(datetime.utcnow())

Expand Down Expand Up @@ -1066,12 +1130,27 @@ def create(self, request):
match_started = str(datetime.utcnow())
try:
match_results = match_item(country_code, name, address, item.id)
match_objects = save_match_details(match_results)
item_matches = match_results['item_matches']

gazetteer_match_count = len(item_matches.keys())

if gazetteer_match_count == 0 and text_only_fallback:
# When testing with more realistic data the text matching
# was returning dozens of results. Limiting to the first 5 is
# reasonable because the results are sorted with the highest
# confidence first.
text_only_matches = {
item.id:
list(text_match_item(item.country_code, item.name)[:5])}
else:
text_only_matches = {}

match_objects = save_match_details(
match_results, text_only_matches=text_only_matches)

automatic_threshold = \
match_results['results']['automatic_threshold']

item_matches = match_results['item_matches']
for item_id, matches in item_matches.items():
result['item_id'] = item_id
result['status'] = item.status
Expand All @@ -1092,6 +1171,27 @@ def create(self, request):
'facility-match-reject',
kwargs={'pk': match.pk})
result['matches'].append(facility_dict)

# Append the text only match results to the response if there were
# no gazetteer matches
if gazetteer_match_count == 0:
for match in match_objects:
if match.results and match.results['text_only_match']:
item.status = FacilityListItem.POTENTIAL_MATCH
context = {'request': request}
facility_dict = FacilityDetailsSerializer(
match.facility, context=context).data
facility_dict['confidence'] = match.confidence
facility_dict['text_only_match'] = True
if should_create:
facility_dict['confirm_match_url'] = reverse(
'facility-match-confirm',
kwargs={'pk': match.pk})
facility_dict['reject_match_url'] = reverse(
'facility-match-reject',
kwargs={'pk': match.pk})
result['matches'].append(facility_dict)

except GazetteerCacheTimeoutError as te:
item.status = FacilityListItem.ERROR_MATCHING
item.processing_results.append({
Expand Down

0 comments on commit 22696df

Please sign in to comment.