Split detailed comparisons in household inference #56

Merged · 1 commit · Apr 4, 2023
households.py: 16 additions & 2 deletions

@@ -114,7 +114,7 @@ def parse_source_file(source_file, debug=False):
     # break out the address into number, street, suffix, etc,
     # so we can prefilter matches based on those
     addr_cols = df.apply(
-        lambda row: addr_parse(row.household_street_address),
+        explode_address,
         axis="columns",
         result_type="expand",
     )
@@ -126,15 +126,29 @@ def parse_source_file(source_file, debug=False):
     return df
 
 
+def explode_address(row):
+    # addr_parse is relatively slow, so only run it once per row.
+    # Caching the exploded dict this way ensures we have it
+    # in the right form in all the places it's needed.
+    parsed = addr_parse(row.household_street_address)
+    parsed["exploded_address"] = parsed.copy()
+    parsed["exploded_address"][
+        "household_street_address"
+    ] = row.household_street_address
+    return parsed
+
+
 def write_households_pii(output_rows, household_time):
     shuffle(output_rows)
     timestamp = household_time.strftime(TIMESTAMP_FMT)
+    hh_pii_path = Path("temp-data") / f"households_pii-{timestamp}.csv"
     with open(
-        Path("temp-data") / f"households_pii-{timestamp}.csv",
+        hh_pii_path,
         "w",
         newline="",
         encoding="utf-8",
     ) as house_csv:
+        print(f"Writing households PII to {hh_pii_path}")
         writer = csv.writer(house_csv)
         writer.writerow(HOUSEHOLD_PII_HEADERS)
         for output_row in output_rows:
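For context on the change above, here is a minimal, self-contained sketch of how `explode_address` behaves under `df.apply(..., result_type="expand")`. The `addr_parse` stub below is a hypothetical stand-in for the real usaddress-based parser; only the dict-in, columns-out mechanics are the point:

```python
import pandas as pd

def addr_parse(addr):
    # hypothetical stand-in: the real parser splits number/street/suffix via usaddress
    number, _, street = addr.partition(" ")
    return {"number": number, "street": street}

def explode_address(row):
    # run the (slow) parser exactly once per row
    parsed = addr_parse(row.household_street_address)
    # cache the parsed fields, plus the raw address, in one nested dict column
    parsed["exploded_address"] = parsed.copy()
    parsed["exploded_address"]["household_street_address"] = row.household_street_address
    return parsed

df = pd.DataFrame({"household_street_address": ["123 Main St", "45 Oak Ave"]})
addr_cols = df.apply(explode_address, axis="columns", result_type="expand")
print(addr_cols.columns.tolist())  # ['number', 'street', 'exploded_address']
```

Because the returned dict becomes one column per key, the pre-parsed address travels with each row into the comparison step without ever re-running the parser.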
households/matching.py: 39 additions & 18 deletions

@@ -43,14 +43,10 @@ def addr_parse(addr):


 # Python version of FRIL matchStreetName functionality
-def address_distance(a1, a2):
+# addr1 and addr2 are dicts that were returned from addr_parse
+def address_distance(addr1, addr2):
     score = 0
     secondary_score = 0
-    # Need to parse because usaddress returns list of tuples without set indices
-    addr1 = addr_parse(a1)
-    addr2 = addr_parse(a2)
-    # Alternative way to parse usaddress.parse(a1) return (less efficient I think)
-    # addr_number_1 = next((v[0] for v in addr1 if v[1] == 'AddressNumber'), None)

     # Change weights based on existence of second level address
     if (
@@ -212,6 +208,8 @@ def address_distance(a1, a2):

     # See if simple string compare of all things combined
     # with a 0.6 adjustment is better
+    a1 = addr1["household_street_address"]
+    a2 = addr2["household_street_address"]
     score = max(
         score,
         textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
@@ -326,8 +324,8 @@ def get_household_matches(pii_lines, split_factor=4, debug=False):
     )
     compare_cl.add(
         AddressComparison(
-            "household_street_address",
-            "household_street_address",
+            "exploded_address",
+            "exploded_address",
             label="household_street_address",
         )
     )
@@ -341,19 +339,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False):
     if debug:
         print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")
 
-    features = compare_cl.compute(candidate_links, pii_lines)
+    matching_pairs = []
+    # we know that we could support len(subset_A) in memory above,
+    # so use the same amount here
+    len_subset_A = int(len(pii_lines) / split_factor)
+
+    # note: np.array_split had unexpectedly poor performance here for very large indices
+    for i in range(0, len(candidate_links), len_subset_A):
+        subset_links = candidate_links[i : i + len_subset_A]
+
+        # filtering the relevant pii lines before passing them into compute() below
+        # seems to have a small positive impact on performance.
+        # subset_links is a MultiIndex, so get the unique values from each level
+        # to find the overall relevant pii lines for this iteration
+        keys = set(subset_links.get_level_values(0)) | set(
+            subset_links.get_level_values(1)
+        )
+        relevant_pii_lines = pii_lines[pii_lines.index.isin(keys)]
+        if debug:
+            print(
+                f"[{datetime.now()}] Detailed comparing rows "
+                f"[{i}..{i + len_subset_A}]"
+            )
+
+        features = compare_cl.compute(subset_links, relevant_pii_lines)
 
-    features["family_name"] *= FN_WEIGHT
-    features["phone_number"] *= PHONE_WEIGHT
-    features["household_street_address"] *= ADDR_WEIGHT
-    features["household_zip"] *= ZIP_WEIGHT
+        features["family_name"] *= FN_WEIGHT
+        features["phone_number"] *= PHONE_WEIGHT
+        features["household_street_address"] *= ADDR_WEIGHT
+        features["household_zip"] *= ZIP_WEIGHT
 
-    # filter the matches down based on the cumulative score
-    matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
+        # filter the matches down based on the cumulative score
+        matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
 
-    matching_pairs = list(matches.index)
-    # matching pairs are bi-directional and not duplicated,
-    # ex if (1,9) is in the list then (9,1) won't be
+        matching_pairs.extend(list(matches.index))
+        # matching pairs are bi-directional and not duplicated,
+        # e.g. if (1,9) is in the list then (9,1) won't be
 
     if debug:
         print(f"[{datetime.now()}] Found {len(matching_pairs)} pairs")
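The heart of this PR is the loop above: instead of one `compare_cl.compute()` call over every candidate pair at once, the MultiIndex of candidate links is sliced into chunks sized to what is already known to fit in memory, and each chunk is scored against only the rows it touches. A standalone sketch of that pattern, with `compute_features` as a hypothetical stand-in for `compare_cl.compute()`:

```python
import pandas as pd

def chunked_matches(candidate_links, pii_lines, chunk_size, threshold, compute_features):
    # candidate_links: a pandas MultiIndex of (row_a, row_b) candidate pairs
    matching_pairs = []
    for i in range(0, len(candidate_links), chunk_size):
        subset_links = candidate_links[i : i + chunk_size]
        # a chunk references rows through both index levels, so the union
        # of the two levels is exactly the set of rows this chunk needs
        keys = set(subset_links.get_level_values(0)) | set(
            subset_links.get_level_values(1)
        )
        relevant_lines = pii_lines[pii_lines.index.isin(keys)]
        # score only this chunk's pairs against only the rows they touch
        features = compute_features(subset_links, relevant_lines)
        matching_pairs.extend(features[features.sum(axis=1) > threshold].index)
    return matching_pairs
```

Plain range slicing is used rather than `np.array_split` because, per the comment in the diff, `np.array_split` performed unexpectedly poorly on very large indices; a slice of a MultiIndex is itself a MultiIndex, so `get_level_values` works on each chunk directly.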